# Fetch the Ames data file once and load it into the session.
# Skipping the download when the file is already on disk avoids a
# network round-trip (and a hard failure when offline) on every re-run.
# `mode = "wb"` makes the binary transfer explicit so the .RData file
# cannot be altered by text-mode newline translation.
ames_url <- "http://www.openintro.org/stat/data/ames.RData"
ames_file <- "ames.RData"
if (!file.exists(ames_file)) {
  download.file(ames_url, destfile = ames_file, mode = "wb")
}
load(ames_file)

# Preview the first rows of `ames`, the object used throughout the
# rest of this script.
head(ames)
## Order PID MS.SubClass MS.Zoning Lot.Frontage Lot.Area Street Alley
## 1 1 526301100 20 RL 141 31770 Pave <NA>
## 2 2 526350040 20 RH 80 11622 Pave <NA>
## 3 3 526351010 20 RL 81 14267 Pave <NA>
## 4 4 526353030 20 RL 93 11160 Pave <NA>
## 5 5 527105010 60 RL 74 13830 Pave <NA>
## 6 6 527105030 60 RL 78 9978 Pave <NA>
## Lot.Shape Land.Contour Utilities Lot.Config Land.Slope Neighborhood
## 1 IR1 Lvl AllPub Corner Gtl NAmes
## 2 Reg Lvl AllPub Inside Gtl NAmes
## 3 IR1 Lvl AllPub Corner Gtl NAmes
## 4 Reg Lvl AllPub Corner Gtl NAmes
## 5 IR1 Lvl AllPub Inside Gtl Gilbert
## 6 IR1 Lvl AllPub Inside Gtl Gilbert
## Condition.1 Condition.2 Bldg.Type House.Style Overall.Qual Overall.Cond
## 1 Norm Norm 1Fam 1Story 6 5
## 2 Feedr Norm 1Fam 1Story 5 6
## 3 Norm Norm 1Fam 1Story 6 6
## 4 Norm Norm 1Fam 1Story 7 5
## 5 Norm Norm 1Fam 2Story 5 5
## 6 Norm Norm 1Fam 2Story 6 6
## Year.Built Year.Remod.Add Roof.Style Roof.Matl Exterior.1st Exterior.2nd
## 1 1960 1960 Hip CompShg BrkFace Plywood
## 2 1961 1961 Gable CompShg VinylSd VinylSd
## 3 1958 1958 Hip CompShg Wd Sdng Wd Sdng
## 4 1968 1968 Hip CompShg BrkFace BrkFace
## 5 1997 1998 Gable CompShg VinylSd VinylSd
## 6 1998 1998 Gable CompShg VinylSd VinylSd
## Mas.Vnr.Type Mas.Vnr.Area Exter.Qual Exter.Cond Foundation Bsmt.Qual
## 1 Stone 112 TA TA CBlock TA
## 2 None 0 TA TA CBlock TA
## 3 BrkFace 108 TA TA CBlock TA
## 4 None 0 Gd TA CBlock TA
## 5 None 0 TA TA PConc Gd
## 6 BrkFace 20 TA TA PConc TA
## Bsmt.Cond Bsmt.Exposure BsmtFin.Type.1 BsmtFin.SF.1 BsmtFin.Type.2
## 1 Gd Gd BLQ 639 Unf
## 2 TA No Rec 468 LwQ
## 3 TA No ALQ 923 Unf
## 4 TA No ALQ 1065 Unf
## 5 TA No GLQ 791 Unf
## 6 TA No GLQ 602 Unf
## BsmtFin.SF.2 Bsmt.Unf.SF Total.Bsmt.SF Heating Heating.QC Central.Air
## 1 0 441 1080 GasA Fa Y
## 2 144 270 882 GasA TA Y
## 3 0 406 1329 GasA TA Y
## 4 0 1045 2110 GasA Ex Y
## 5 0 137 928 GasA Gd Y
## 6 0 324 926 GasA Ex Y
## Electrical X1st.Flr.SF X2nd.Flr.SF Low.Qual.Fin.SF Gr.Liv.Area
## 1 SBrkr 1656 0 0 1656
## 2 SBrkr 896 0 0 896
## 3 SBrkr 1329 0 0 1329
## 4 SBrkr 2110 0 0 2110
## 5 SBrkr 928 701 0 1629
## 6 SBrkr 926 678 0 1604
## Bsmt.Full.Bath Bsmt.Half.Bath Full.Bath Half.Bath Bedroom.AbvGr
## 1 1 0 1 0 3
## 2 0 0 1 0 2
## 3 0 0 1 1 3
## 4 1 0 2 1 3
## 5 0 0 2 1 3
## 6 0 0 2 1 3
## Kitchen.AbvGr Kitchen.Qual TotRms.AbvGrd Functional Fireplaces
## 1 1 TA 7 Typ 2
## 2 1 TA 5 Typ 0
## 3 1 Gd 6 Typ 0
## 4 1 Ex 8 Typ 2
## 5 1 TA 6 Typ 1
## 6 1 Gd 7 Typ 1
## Fireplace.Qu Garage.Type Garage.Yr.Blt Garage.Finish Garage.Cars
## 1 Gd Attchd 1960 Fin 2
## 2 <NA> Attchd 1961 Unf 1
## 3 <NA> Attchd 1958 Unf 1
## 4 TA Attchd 1968 Fin 2
## 5 TA Attchd 1997 Fin 2
## 6 Gd Attchd 1998 Fin 2
## Garage.Area Garage.Qual Garage.Cond Paved.Drive Wood.Deck.SF
## 1 528 TA TA P 210
## 2 730 TA TA Y 140
## 3 312 TA TA Y 393
## 4 522 TA TA Y 0
## 5 482 TA TA Y 212
## 6 470 TA TA Y 360
## Open.Porch.SF Enclosed.Porch X3Ssn.Porch Screen.Porch Pool.Area Pool.QC
## 1 62 0 0 0 0 <NA>
## 2 0 0 0 120 0 <NA>
## 3 36 0 0 0 0 <NA>
## 4 0 0 0 0 0 <NA>
## 5 34 0 0 0 0 <NA>
## 6 36 0 0 0 0 <NA>
## Fence Misc.Feature Misc.Val Mo.Sold Yr.Sold Sale.Type Sale.Condition
## 1 <NA> <NA> 0 5 2010 WD Normal
## 2 MnPrv <NA> 0 6 2010 WD Normal
## 3 <NA> Gar2 12500 6 2010 WD Normal
## 4 <NA> <NA> 0 4 2010 WD Normal
## 5 MnPrv <NA> 0 3 2010 WD Normal
## 6 <NA> <NA> 0 6 2010 WD Normal
## SalePrice
## 1 215000
## 2 105000
## 3 172000
## 4 244000
## 5 189900
## 6 195500
# Pull the two columns studied in this lab out of `ames` as plain vectors.
area <- ames[["Gr.Liv.Area"]]
price <- ames[["SalePrice"]]

# Numeric summary of the full set of `area` values, then its histogram.
summary(area)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 334 1126 1442 1500 1743 5642
hist(area)
Exercise 1. The population distribution is near normal, but it is skewed to the right.
# Draw one random sample of 50 values from `area`
# (sample() draws without replacement unless told otherwise).
sampl1 <- sample(x = area, size = 50)
Exercise 2. The distribution of the sample looks right-skewed and much less normal than the population distribution; it has two spikes. The means of the two distributions are close to each other (1558 vs. 1500). The sample is less spread out than the population (minimums 720 vs. 334 and maximums 3086 vs. 5642).
# Describe the 50-value sample `sampl1`: numeric summary first.
summary(sampl1)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 720 1106 1487 1558 1910 3086
# Shape of the sample's distribution.
hist(sampl1)
# Sample mean, printed at full precision (summary() above rounds it).
mean(sampl1)
## [1] 1558.24
Exercise 3. The mean of sample 2 was lower than the mean of sample 1 (1461.80 vs. 1558.24). The sample of size 1000 would give the best estimate of the population mean.
# Take further samples from `area` at three sizes and print each
# sample's mean. The draws are made in this order: 50, 100, 1000.

# Second sample of 50.
sampl2 <- sample(x = area, size = 50)
mean(sampl2)
## [1] 1461.8

# Sample of 100.
sampl3 <- sample(x = area, size = 100)
mean(sampl3)
## [1] 1504.11

# Sample of 1000.
sampl4 <- sample(x = area, size = 1000)
mean(sampl4)
## [1] 1537.141
# Build 5000 sample means: each element is the mean of a fresh
# sample of 50 values drawn from `area`.
sample_means50 <- vapply(
  seq_len(5000),
  function(draw) mean(sample(area, 50)),
  numeric(1)
)

# Plot the 5000 means with default binning, then again with
# 25 suggested breaks for a finer view.
hist(sample_means50)
hist(sample_means50, breaks = 25)
Exercise 4. There are 5,000 elements in sample_means50. The distribution is approximately normal and is centered around 1500. I do not expect the distribution to change significantly if the number of samples increases; it would just look a little smoother and closer to a normal distribution.
# The first four iterations written out by hand: each one draws 50
# values from `area` and stores their mean in the next slot.
sample_means50[1] <- mean(sample(area, 50))
sample_means50[2] <- mean(sample(area, 50))
sample_means50[3] <- mean(sample(area, 50))
sample_means50[4] <- mean(sample(area, 50))

# The same step repeated for all 5000 slots (overwriting the four
# filled above). The counter is printed only while it is below 100.
for (i in seq_len(5000)) {
  samp <- sample(area, 50)
  sample_means50[i] <- mean(samp)
  if (i < 100) {
    print(i)
  }
}
## [1] 1
## [1] 2
## [1] 3
## [1] 4
## [1] 5
## [1] 6
## [1] 7
## [1] 8
## [1] 9
## [1] 10
## [1] 11
## [1] 12
## [1] 13
## [1] 14
## [1] 15
## [1] 16
## [1] 17
## [1] 18
## [1] 19
## [1] 20
## [1] 21
## [1] 22
## [1] 23
## [1] 24
## [1] 25
## [1] 26
## [1] 27
## [1] 28
## [1] 29
## [1] 30
## [1] 31
## [1] 32
## [1] 33
## [1] 34
## [1] 35
## [1] 36
## [1] 37
## [1] 38
## [1] 39
## [1] 40
## [1] 41
## [1] 42
## [1] 43
## [1] 44
## [1] 45
## [1] 46
## [1] 47
## [1] 48
## [1] 49
## [1] 50
## [1] 51
## [1] 52
## [1] 53
## [1] 54
## [1] 55
## [1] 56
## [1] 57
## [1] 58
## [1] 59
## [1] 60
## [1] 61
## [1] 62
## [1] 63
## [1] 64
## [1] 65
## [1] 66
## [1] 67
## [1] 68
## [1] 69
## [1] 70
## [1] 71
## [1] 72
## [1] 73
## [1] 74
## [1] 75
## [1] 76
## [1] 77
## [1] 78
## [1] 79
## [1] 80
## [1] 81
## [1] 82
## [1] 83
## [1] 84
## [1] 85
## [1] 86
## [1] 87
## [1] 88
## [1] 89
## [1] 90
## [1] 91
## [1] 92
## [1] 93
## [1] 94
## [1] 95
## [1] 96
## [1] 97
## [1] 98
## [1] 99
Exercise 5.
# A smaller version of the simulation: 100 means, each computed from
# a sample of 50 values drawn from `area`.
sample_means_small <- vapply(
  seq_len(100),
  function(draw) mean(sample(area, 50)),
  numeric(1)
)

# Print all 100 means.
sample_means_small
## [1] 1634.98 1475.74 1476.06 1610.10 1582.74 1486.46 1537.88 1506.12
## [9] 1421.48 1386.48 1435.26 1527.20 1364.58 1535.58 1427.92 1554.48
## [17] 1555.40 1495.78 1460.66 1582.22 1514.58 1494.52 1508.32 1380.32
## [25] 1521.76 1458.68 1314.30 1660.00 1590.58 1425.52 1459.66 1472.48
## [33] 1464.20 1459.42 1506.04 1501.44 1522.38 1451.20 1522.16 1567.90
## [41] 1537.76 1430.84 1587.62 1500.62 1379.44 1505.60 1489.24 1517.72
## [49] 1569.20 1557.48 1428.24 1490.26 1363.64 1478.48 1518.06 1419.00
## [57] 1513.54 1431.94 1578.26 1493.00 1541.14 1596.70 1451.60 1547.46
## [65] 1501.54 1502.18 1669.20 1471.52 1505.40 1445.74 1545.82 1500.68
## [73] 1437.10 1541.18 1474.08 1435.76 1561.72 1394.78 1469.18 1489.36
## [81] 1496.84 1512.14 1585.84 1514.24 1513.14 1546.66 1630.98 1402.00
## [89] 1553.16 1509.46 1447.58 1552.26 1396.44 1603.62 1491.14 1587.16
## [97] 1499.42 1575.22 1400.90 1535.46
# Histogram of the sample means currently stored in `sample_means50`.
hist(sample_means50)

# Repeat the simulation for sample sizes 10 and 100, 5000 means each.
sample_means10 <- rep(NA_real_, 5000)
sample_means100 <- rep(NA_real_, 5000)
for (i in seq_len(5000)) {
  # Keep this order: within each iteration the size-10 sample is drawn
  # before the size-100 sample, so the random draws are consumed in the
  # same sequence.
  sample_means10[i] <- mean(sample(area, 10))
  sample_means100[i] <- mean(sample(area, 100))
}
# Stack the three histograms of sample means (n = 10, 50, 100) in one
# column on a shared x-axis so they can be compared directly.
# par() returns the settings it replaces; keep them so the 3 x 1
# layout can be undone and does not leak into every later plot.
old_par <- par(mfrow = c(3, 1))

# Common x-range for all three panels, taken from the n = 10 means.
xlimits <- range(sample_means10)
hist(sample_means10, breaks = 20, xlim = xlimits)
hist(sample_means50, breaks = 20, xlim = xlimits)
hist(sample_means100, breaks = 20, xlim = xlimits)

# Restore the previous plotting layout.
par(old_par)
Exercise 6. The center does not change as the sample size increases; however, the spread gets smaller.
On my own.
# One sample of 50 values from `price` and its mean.
sample_price <- sample(x = price, size = 50)
mean(sample_price)
## [1] 199953

# 5000 means of size-50 samples from `price`.
# Note: this reuses (and overwrites) the name `sample_means50`.
sample_means50 <- vapply(
  seq_len(5000),
  function(draw) mean(sample(price, 50)),
  numeric(1)
)
hist(sample_means50)

# Mean of all `price` values, for comparison with the values above.
mean(price)
## [1] 180796.1
# Same simulation with a larger sample size: 5000 means, each from a
# sample of 150 values drawn from `price`.
sample_means150 <- vapply(
  seq_len(5000),
  function(draw) mean(sample(price, 150)),
  numeric(1)
)

# Default binning, then a finer view with 25 suggested breaks.
hist(sample_means150)
hist(sample_means150, breaks = 25)