#Consider real estate data from the city of Ames, Iowa. The details of every real estate transaction in Ames are recorded by the City Assessor’s office. Our particular focus for this lab will be all residential home sales in Ames between 2006 and 2010. This collection represents our population of interest. In this lab we would like to learn about these home sales by taking smaller samples from the full population. Let’s load the data.
library(openintro)
## Loading required package: airports
## Loading required package: cherryblossom
## Loading required package: usdata
library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.0 ──
## ✓ ggplot2 3.3.3 ✓ purrr 0.3.4
## ✓ tibble 3.0.6 ✓ dplyr 1.0.4
## ✓ tidyr 1.1.2 ✓ stringr 1.4.0
## ✓ readr 1.4.0 ✓ forcats 0.5.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
library(ggplot2)
library(patchwork)
# Load the Ames sales data and pull out the two variables studied in this lab.
# The CSV is read relative to the working directory (the folder reported by
# getwd() below) rather than through a machine-specific absolute path, so the
# script runs anywhere once the working directory holds ames.csv.
getwd()
## [1] "/Users/Darwinbeliever01/Desktop/math 217/week_5"
ames <- read.csv("ames.csv")
# area: living area of each sale; price: sale price of each sale.
area <- ames$Gr.Liv.Area
price <- ames$SalePrice
#Let’s look at the distribution of area in our population of home sales by calculating a few summary statistics and making a histogram.
# Population-level summaries of living area and sale price.
# Every statistic is computed up front; the results are then reported in a
# fixed order, each followed by the console output recorded for it.
sumarea <- summary(area)
sumprice <- summary(price)
areamean <- mean(area)
areasd <- sd(area)
pricemean <- mean(price)
pricesd <- sd(price)

# Quartiles, extremes and mean of each variable.
print(sumarea)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 334 1126 1442 1500 1743 5642
print(sumprice)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 12789 129500 160000 180796 213500 755000

# Mean and standard deviation of living area.
print(areamean)
## [1] 1499.69
print(areasd)
## [1] 505.5089

# Both vectors hold one value per sale.
print(length(area))
## [1] 2930
print(length(price))
## [1] 2930

# Mean and standard deviation of sale price.
print(pricemean)
## [1] 180796.1
print(pricesd)
## [1] 79886.69
# Boxplots of living area and sale price; the thick red vertical line in each
# panel marks the mean of that variable.
areaboxplot <- ggplot(ames, aes(x = area)) +
  geom_boxplot() +
  geom_vline(xintercept = areamean, col = "red", lwd = 2) +
  ggtitle("distribution household living area")

priceboxplot <- ggplot(ames, aes(x = price)) +
  geom_boxplot() +
  geom_vline(xintercept = pricemean, col = "red", lwd = 2) +
  ggtitle("distribution household price")

# Combine the two panels into a single figure (patchwork `+`).
areaboxplot + priceboxplot
# Density-scaled histograms of living area and sale price. Each panel is
# overlaid with the normal density that has the same mean and sd (tomato
# curve) and a red vertical line at the mean.
# after_stat(density) is the supported spelling of the older `..density..`
# notation, and stat_function() takes its extra arguments as a list.
areahistogram <- ggplot(data = ames, aes(x = area)) +
  geom_histogram(bins = 10, aes(y = after_stat(density))) +
  stat_function(
    fun = dnorm,
    args = list(mean = areamean, sd = areasd),
    col = "tomato"
  ) +
  ggtitle("distribution household living area") +
  geom_vline(xintercept = areamean, col = "red")

pricehistogram <- ggplot(data = ames, aes(x = price)) +
  geom_histogram(bins = 10, aes(y = after_stat(density))) +
  stat_function(
    fun = dnorm,
    args = list(mean = pricemean, sd = pricesd),
    col = "tomato"
  ) +
  ggtitle("distribution household price") +
  geom_vline(xintercept = pricemean, col = "red")

# Combine the two panels into a single figure (patchwork `+`).
areahistogram + pricehistogram
# Normal probability (Q-Q) plots of living area and sale price. Each plot
# layers a connecting line through the Q-Q points, the points themselves, and
# the Q-Q reference line.
areaQQ <- ggplot(data = ames, aes(sample = area)) +
  geom_line(stat = "qq") +
  stat_qq() +
  stat_qq_line() +
  ggtitle("normal prob plot household living area")

priceQQ <- ggplot(data = ames, aes(sample = price)) +
  geom_line(stat = "qq") +
  stat_qq() +
  stat_qq_line() +
  ggtitle("normal prob plot household price")

# Combine the two panels into a single figure (patchwork `+`).
areaQQ + priceQQ
#Describe this population distribution.
#The area data has n = 2930, mean = 1499.69 sqft and sd = 505.5089 (population values, since this is the full population). The boxplot shows numerous outliers in the upper range. From the histogram and Q-Q plot we see the distribution is skewed to the right.
#The price data has n = 2930, mean = $180796.1 and sd = 79886.69 (population values, since this is the full population). The boxplot shows numerous outliers in the upper range. From the histogram and Q-Q plot we see the distribution is skewed to the right.
#In this lab we have access to the entire population, but this is rarely the case in real life. Gathering information on an entire population is often extremely costly or impossible. Because of this, we often take a sample of the population and use that to understand the properties of the population.
#estimate mean living area in Ames data. first select a sample n = 50
# Draw one random sample of 50 living areas from the population vector.
# NOTE: no set.seed() call precedes this in the script as shown, so every run
# draws a different sample; the values printed below are from one run.
samp1 <- sample(area, 50)
print(samp1)
## [1] 2057 1456 910 2555 1595 1114 2402 1677 990 864 1540 1072 2392 1155 1625
## [16] 1911 1290 2039 1069 2032 1368 1829 1734 1891 995 2161 1476 1369 894 1534
## [31] 1428 1656 1167 1251 2046 1250 2167 2345 630 1513 1208 958 1601 1343 1302
## [46] 854 912 1315 1338 1382
# Wrap the sample in a one-column tibble (column name: samp1) for ggplot.
# tibble() is the supported replacement for the deprecated data_frame().
sampl1 <- tibble(samp1)
# Assess the first sample: size, summary statistics, then three plots.
# The statistics are computed first and reported afterwards in a fixed order.
sumsamp1 <- summary(samp1)
meansamp1 <- mean(samp1)
sdsamp1 <- sd(samp1)

print(length(samp1))
## [1] 50
print(sumsamp1)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 630 1158 1405 1493 1805 2555
print(meansamp1)
## [1] 1493.24
print(sdsamp1)
## [1] 467.449

# Base-graphics histogram and boxplot of the raw sample vector.
hist(samp1)
boxplot(samp1)

# Normal probability plot built from the one-column tibble sampl1.
sampl1QQ <- ggplot(data = sampl1, aes(sample = samp1)) +
  geom_line(stat = "qq") +
  stat_qq() +
  stat_qq_line() +
  ggtitle("normal prob plot household living area")
sampl1QQ
### Exercise 2
#Describe the distribution of this sample. How does it compare to the distribution of the population?
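#Answer: the sample has n = 50, mean = 1392.08 sqft, and sd = 461.7023 (these values come from a different random draw than the output printed above, because the sample is not seeded). The boxplot shows outliers in the upper range, and the histogram and Q-Q plot are skewed to the right. Therefore the sample and the population have similar summary statistics and a similar shape.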
#Take a second sample, also of size 50, and call it samp2 .
# Second random sample of 50 living areas, drawn independently of samp1.
# NOTE: the draw is unseeded, so the printed values are from one run.
samp2 <- sample(area, 50)
print(samp2)
## [1] 1250 1621 2153 2169 2201 1405 1797 1646 1800 1357 1525 1079 1480 1774 2385
## [16] 876 1471 1360 1072 864 1330 1268 1702 1092 725 1851 1991 1548 2624 1332
## [31] 1801 2329 1499 1114 1498 2520 1953 796 2614 1260 1271 1960 990 2256 816
## [46] 1116 816 1093 1125 1294
# Wrap the sample in a one-column tibble (column name: samp2) for ggplot.
# tibble() is the supported replacement for the deprecated data_frame().
sample2 <- tibble(samp2)
# summary() of the tibble prints a per-column table rather than the
# single-row layout produced for a bare numeric vector.
sumsamp2 <- summary(sample2)
print(sumsamp2)
## samp2
## Min. : 725
## 1st Qu.:1118
## Median :1476
## Mean :1537
## 3rd Qu.:1838
## Max. :2624
sdsample2 <- sd(samp2)
print(sdsample2)
## [1] 508.6744
# Plain hist(samp2): extra parentheses would leak into the default title.
hist(samp2)
# Normal probability plot of the second sample.
ggplot(data = sample2, aes(sample = samp2)) +
  geom_line(stat = "qq") +
  stat_qq() +
  stat_qq_line() +
  ggtitle("normal prob plot household living area sample2")
#How does the mean of samp2 compare with the mean of samp1 ? Suppose we took two more samples, one of size 100 and one of size 1000. Which would you think would provide a more accurate estimate of the population mean?
#Answer: comparing sampl1 and sampl2, the means and sds are similar, and the distributions are similarly skewed to the right. Of the two further samples, the one of size 1000 would give the more accurate estimate: with a larger n, the sample mean tends to fall closer to the population mean and its spread (standard error) narrows.
#generate 5000 samples and compute the sample mean of each.
# Simulate the sampling distribution of the mean living area for n = 50:
# draw 5000 independent samples of size 50 and keep each sample's mean.
sample_means50 <- rep(NA, 5000)
for (i in seq_len(5000)) {
  samp <- sample(area, 50)
  sample_means50[i] <- mean(samp)
}
# Histogram of the 5000 sample means (drawn while the object is still a
# plain numeric vector).
hist(sample_means50, breaks = 50)
# Re-store the means as a one-column tibble of the same name.
# tibble() is the supported replacement for the deprecated data_frame().
sample_means50 <- tibble(sample_means50)
#How many elements are there in sample_means50 ? Describe the sampling distribution, and be sure to specifically note its center. Would you expect the distribution to change if we instead collected 50,000 sample means?
#Answer: there are 5000 elements, each the mean of one random sample of n = 50 drawn inside the for loop. If we ran the loop 50,000 times, the histogram of sample means would more closely approach the ideal normal distribution.
# Same simulation with ten times as many repetitions: 50000 samples of
# size 50, one stored mean per sample.
sample_means50000 <- numeric(50000)
for (i in seq_len(50000)) {
  samp <- sample(area, 50)
  sample_means50000[i] <- mean(samp)
}
# Histogram of the 50000 sample means.
hist(sample_means50000, breaks = 50)
### Exercise 5
#To make sure you understand what you’ve done in this loop, try running a smaller version. Initialize a vector of 100 zeros called sample_means_small . Run a loop that takes a sample of size 50 from area and stores the sample mean in sample_means_small , but only iterate from 1 to 100. Print the output to your screen (type sample_means_small into the console and press enter). How many elements are there in this object called sample_means_small ? What does each element represent? #Answer: there are 100 elements; each element is the mean of one random sample of 50 living areas, and the for loop draws 100 such samples.
# Small version of the simulation: 100 samples of size 50, one stored mean
# per sample.
sample_mean100 <- numeric(100)
for (i in seq_len(100)) {
  samp <- sample(area, 50)
  sample_mean100[i] <- mean(samp)
}
# Histogram of the 100 sample means.
hist(sample_mean100, breaks = 50)
# Sampling distributions of the mean living area for two sample sizes,
# n = 10 and n = 100, with 5000 repetitions each.
# Within every iteration the size-10 sample is drawn before the size-100
# sample, so the two simulations share one interleaved random stream.
sample_means10 <- numeric(5000)
sample_means100 <- numeric(5000)
for (i in seq_len(5000)) {
  samp <- sample(area, 10)
  sample_means10[i] <- mean(samp)

  samp <- sample(area, 100)
  sample_means100[i] <- mean(samp)
}
#When the sample size is larger, what happens to the center? What about the spread?
#The central limit theorem tells us the mean of the sampling distribution of Y bar is equal to the population mean mu, so the center does not change. Further, the sd of the sampling distribution is equal to the population sd / sqrt(n). This is an inverse relationship: as n increases, the sd decreases. Therefore the spread of the sample means in the histogram will decrease.
#So far, we have only focused on estimating the mean living area in homes in Ames. Now you’ll try to estimate the mean home price.
#Take a random sample of size 50 from price . Using this sample, what is your best point estimate of the population mean?
# Sampling distribution of the mean sale price for n = 50:
# 5000 samples of size 50 from price, one stored mean per sample.
sample_price50 <- numeric(5000)
for (i in seq_len(5000)) {
  samp <- sample(price, 50)
  sample_price50[i] <- mean(samp)
}
# Histogram of the 5000 sample means.
hist(sample_price50, breaks = 50)

# Center and spread of the simulated sampling distribution.
meansample_price50 <- mean(sample_price50)
sdsample_price50 <- sd(sample_price50)
print(meansample_price50)
## [1] 180668.4
print(sdsample_price50)
## [1] 11384.21
#The sampling distribution above appears approximately normal, so by the central limit theorem its mean, 180668.4, should be approximately the same as the population mean.
#Since you have access to the population, simulate the sampling distribution for x price samples from the population of size 50 and computing 5000 sample means. Store these means in a vector called sample_means50 . Plot the data, then describe the shape of this sampling distribution. Based on this sampling distribution, what would you guess the mean home price of the population to be? Finally, calculate and report the population mean.
#Answer: the sampling distribution above appears approximately normal, so its mean, 180668.4, is our guess for the population mean; the actual population mean is 180796.1.
#Change your sample size from 50 to 150, then compute the sampling distribution using the same method as above, and store these means in a new vector called sample_means150 . Describe the shape of this sampling distribution, and compare it to the sampling distribution for a sample size of 50. Based on this sampling distribution, what would you guess to be the mean sale price of homes in Ames?
# Sampling distribution of the mean sale price for n = 150:
# 5000 samples of size 150 from price, one stored mean per sample.
sample_price150 <- numeric(5000)
for (i in seq_len(5000)) {
  samp <- sample(price, 150)
  sample_price150[i] <- mean(samp)
}

# Center and spread of the simulated sampling distribution.
meansample_price150 <- mean(sample_price150)
sdsample_price150 <- sd(sample_price150)
print(meansample_price150)
## [1] 180782.2
print(sdsample_price150)
## [1] 6319.575
# Histogram of the 5000 sample means.
hist(sample_price150, breaks = 50)
#The sampling distributions for n = 50 and n = 150 have means of 180668.4 and 180782.2 respectively. Both distributions appear normal. The Ames population mean is approximately the same as the sampling distribution means. From the central limit theorem, if we increase the sample size n, the sample means will more closely approximate the population mean.
#Of the sampling distributions from 2 and 3, which has a smaller spread? If we’re concerned with making estimates that are more often close to the true value, would we prefer a distribution with a large or small spread? # The standard errors for n = 50 and n = 150 are 11384.21 and 6319.575 respectively, so the n = 150 distribution has the smaller spread, and a small spread is what we prefer. This makes sense, as se = sigma/sqrt(n): the se is inversely related to sqrt(n), so if we want to decrease the standard error we should increase n.