library(tidyverse)
## Warning: package 'tidyverse' was built under R version 4.1.3
## -- Attaching packages --------------------------------------- tidyverse 1.3.1 --
## v ggplot2 3.3.5 v purrr 0.3.4
## v tibble 3.1.4 v dplyr 1.0.7
## v tidyr 1.1.3 v stringr 1.4.0
## v readr 2.1.1 v forcats 0.5.1
## Warning: package 'ggplot2' was built under R version 4.1.3
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
library(here)
## Warning: package 'here' was built under R version 4.1.3
## here() starts at C:/Users/user/Documents/data
library(ggplot2)
library(dplyr)
# Read the Ames housing data from data/ames.csv under the project root
# located by here().
ames <- here("data", "ames.csv") %>%
  read_csv()
## Rows: 2930 Columns: 82
## -- Column specification --------------------------------------------------------
## Delimiter: ","
## chr (43): MS.Zoning, Street, Alley, Lot.Shape, Land.Contour, Utilities, Lot....
## dbl (39): Order, PID, MS.SubClass, Lot.Frontage, Lot.Area, Overall.Qual, Ove...
##
## i Use `spec()` to retrieve the full column specification for this data.
## i Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Auto-print the tibble to inspect the first rows and the column types.
ames
## # A tibble: 2,930 x 82
## Order PID MS.SubClass MS.Zoning Lot.Frontage Lot.Area Street Alley
## <dbl> <dbl> <dbl> <chr> <dbl> <dbl> <chr> <chr>
## 1 1 526301100 20 RL 141 31770 Pave <NA>
## 2 2 526350040 20 RH 80 11622 Pave <NA>
## 3 3 526351010 20 RL 81 14267 Pave <NA>
## 4 4 526353030 20 RL 93 11160 Pave <NA>
## 5 5 527105010 60 RL 74 13830 Pave <NA>
## 6 6 527105030 60 RL 78 9978 Pave <NA>
## 7 7 527127150 120 RL 41 4920 Pave <NA>
## 8 8 527145080 120 RL 43 5005 Pave <NA>
## 9 9 527146030 120 RL 39 5389 Pave <NA>
## 10 10 527162130 60 RL 60 7500 Pave <NA>
## # ... with 2,920 more rows, and 74 more variables: Lot.Shape <chr>,
## # Land.Contour <chr>, Utilities <chr>, Lot.Config <chr>, Land.Slope <chr>,
## # Neighborhood <chr>, Condition.1 <chr>, Condition.2 <chr>, Bldg.Type <chr>,
## # House.Style <chr>, Overall.Qual <dbl>, Overall.Cond <dbl>,
## # Year.Built <dbl>, Year.Remod.Add <dbl>, Roof.Style <chr>, Roof.Matl <chr>,
## # Exterior.1st <chr>, Exterior.2nd <chr>, Mas.Vnr.Type <chr>,
## # Mas.Vnr.Area <dbl>, Exter.Qual <chr>, Exter.Cond <chr>, ...
# Fix the random number generator state so every random draw below is
# reproducible from run to run.
set.seed(seed = 773952)
##1. Create a histogram of the areas variable (set binwidth = 250).
# Histogram of the `area` column for every row of `ames`, 250-wide bins.
ggplot(ames, aes(x = area)) +
  geom_histogram(binwidth = 250, colour = "red ", fill = " pink") +
  ggtitle("Histogram displaying areas in terms of square foot")
# Population parameters of `area`: centre, spread, extremes and quartiles.
summarise(
  ames,
  mu      = mean(area),
  pop_med = median(area),
  sigma   = sd(area),
  pop_iqr = IQR(area),
  pop_min = min(area),
  pop_max = max(area),
  pop_q1  = quantile(area, probs = 0.25),  # first quartile, 25th percentile
  pop_q3  = quantile(area, probs = 0.75)   # third quartile, 75th percentile
)
## # A tibble: 1 x 8
## mu pop_med sigma pop_iqr pop_min pop_max pop_q1 pop_q3
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 1500. 1442 506. 617. 334 5642 1126 1743.
# Density-scaled histogram of the population `area` values, with a normal
# curve (same mean and standard deviation as the data) overlaid for comparison.
ggplot(ames, aes(x = area)) +
  ggtitle("Population distribution") +
  # after_stat(density) replaces the deprecated `..density..` notation.
  geom_histogram(
    aes(y = after_stat(density)),
    alpha = 0.25, fill = "yellow", binwidth = 250
  ) +
  # `args` is a list of extra arguments handed to `fun` (here dnorm).
  stat_function(
    fun = dnorm,
    args = list(mean = mean(ames$area), sd = sd(ames$area)),
    colour = "red"
  )
## 3. Use sample_n to select a random sample of 50 houses from our data frame. Store the results in a new variable called samp1.
# Draw one random sample of 50 rows from `ames`, then display it.
samp1 <- ames %>% sample_n(size = 50)
samp1
## # A tibble: 50 x 82
## Order PID MS.SubClass MS.Zoning Lot.Frontage Lot.Area Street Alley
## <dbl> <dbl> <dbl> <chr> <dbl> <dbl> <chr> <chr>
## 1 2559 534455080 20 RL 80 9600 Pave <NA>
## 2 2036 903456110 60 RM 60 9780 Pave Grvl
## 3 704 902134120 50 RM 56 10134 Pave Grvl
## 4 697 902105050 50 RM 90 15660 Pave <NA>
## 5 2903 921205050 20 RL 88 11577 Pave <NA>
## 6 2479 531450040 60 RL 65 7153 Pave <NA>
## 7 2758 906392090 20 RL 90 13377 Pave <NA>
## 8 731 903201080 50 RL 55 7264 Pave <NA>
## 9 2070 905226170 20 RL 85 13770 Pave <NA>
## 10 1706 528144050 60 RL 86 10562 Pave <NA>
## # ... with 40 more rows, and 74 more variables: Lot.Shape <chr>,
## # Land.Contour <chr>, Utilities <chr>, Lot.Config <chr>, Land.Slope <chr>,
## # Neighborhood <chr>, Condition.1 <chr>, Condition.2 <chr>, Bldg.Type <chr>,
## # House.Style <chr>, Overall.Qual <dbl>, Overall.Cond <dbl>,
## # Year.Built <dbl>, Year.Remod.Add <dbl>, Roof.Style <chr>, Roof.Matl <chr>,
## # Exterior.1st <chr>, Exterior.2nd <chr>, Mas.Vnr.Type <chr>,
## # Mas.Vnr.Area <dbl>, Exter.Qual <chr>, Exter.Cond <chr>, ...
# Histogram of `area` within the 50-row sample `samp1`, 250-wide bins.
ggplot(samp1, aes(x = area)) +
  geom_histogram(binwidth = 250, colour = " yellow", fill = "purple") +
  ggtitle("Distribution of area")
# Same summary statistics as for the population, computed on `samp1` only.
summarise(
  samp1,
  mean_sample    = mean(area),
  pop_med_sample = median(area),
  sd_sample      = sd(area),
  pop_iqr_sample = IQR(area),
  pop_min_sample = min(area),
  pop_max_sample = max(area),
  pop_q1_sample  = quantile(area, probs = 0.25),  # first quartile, 25th percentile
  pop_q3_sample  = quantile(area, probs = 0.75)   # third quartile, 75th percentile
)
## # A tibble: 1 x 8
## mean_sample pop_med_sample sd_sample pop_iqr_sample pop_min_sample
## <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 1571. 1452. 659. 606. 498
## # ... with 3 more variables: pop_max_sample <dbl>, pop_q1_sample <dbl>,
## # pop_q3_sample <dbl>
## From the plot we can see that the sample distribution is right-skewed, and that the distribution of area in the sample is almost the same as the distribution of area in the population.
# Mean area in the first sample.
samp1 %>% pull(area) %>% mean()
## [1] 1570.98
# Mean area in the full data set, for comparison.
ames %>% pull(area) %>% mean()
## [1] 1499.69
## I would not expect the mean of my sample to match anyone else's, since I used my student ID as the seed number. Other students are unlikely to have used the same seed, so their random samples and sample means will differ. I also compared my result with Chaitanya P., who got a different mean from the one I got.
# A second random sample of 50 rows and its mean area.
samp2 <- ames %>% sample_n(size = 50)
samp2 %>% pull(area) %>% mean()
## [1] 1517.52
## The mean of samp1 is 1570.98 and the mean of samp2 is 1517.52, so the mean of samp1 is greater than the mean of samp2.
## 9. Suppose we took two more samples, one of size 100 and one of size 1000. Which would you think would provide a more accurate estimate of the population mean? Check your answer by taking the two samples and calculating the mean of each.
# Larger samples: 100 rows, then 1000 rows, with the mean area of each.
samp3 <- ames %>% sample_n(size = 100)
samp3 %>% pull(area) %>% mean()
## [1] 1564.65
samp4 <- ames %>% sample_n(size = 1000)
samp4 %>% pull(area) %>% mean()
## [1] 1499.725
## Since the mean of the sample of size 1000 (1499.725) is very close to the actual population mean (1499.69), I conclude that the larger sample gives the more accurate estimate of the population mean, i.e., the sample of size 1000 is more accurate than the sample of size 100.
# Sampling distribution of the mean: 15,000 simulated samples of 50 `area`
# values each (drawn with replacement), one sample mean per row.
sample_means50 <- tibble(
  sample_means = replicate(
    n = 15000,
    expr = mean(sample(x = ames$area, size = 50, replace = TRUE))
  )
)
sample_means50
## # A tibble: 15,000 x 1
## sample_means
## <dbl>
## 1 1572.
## 2 1603.
## 3 1387.
## 4 1421.
## 5 1543.
## 6 1477.
## 7 1527.
## 8 1551.
## 9 1542.
## 10 1486.
## # ... with 14,990 more rows
# Histogram of the 15,000 simulated sample means (default number of bins).
ggplot(sample_means50, aes(x = sample_means)) +
  geom_histogram(colour = "orange", fill = " pink") +
  ggtitle(" Histogram showing results stored in sample_means50")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## 11. How many elements are there in sample_means50? Describe the sampling distribution, and be sure to specifically note its center.
# Number of simulated sample means (one per row).
nrow(sample_means50)
## [1] 15000
# Centre of the sampling distribution. median() is correct for both even and
# odd lengths; indexing the sorted vector at n / 2 returns only the lower
# middle element when n is even and is off by one when n is odd.
sample_means50_sort <- sort(sample_means50$sample_means)
median(sample_means50_sort)
## [1] 1498.1
## The center (median) of the sampling distribution is about 1498.1, which is very close to the population mean of 1499.69.
##12. To make sure you understand how sampling distributions are built, try modifying the code to create a sampling distribution of 25 sample means from samples of size 10, and put them in a data frame named sample_means_small. Plot the results. How many observations are there in this object called sample_means_small? What does each observation represent?
# A small sampling distribution: 25 simulated samples of 10 `area` values
# each (drawn with replacement), one sample mean per row.
sample_means_small <- tibble(
  sample_means = replicate(
    n = 25,
    expr = mean(sample(x = ames$area, size = 10, replace = TRUE))
  )
)
sample_means_small
## # A tibble: 25 x 1
## sample_means
## <dbl>
## 1 1483.
## 2 1236.
## 3 1501.
## 4 1493.
## 5 1386.
## 6 1387
## 7 1498.
## 8 1472
## 9 1746.
## 10 1426.
## # ... with 15 more rows
## There are 25 observations (rows) in sample_means_small. Each observation is the mean area of one random sample of 10 houses.
## 13. Use the code below to create sampling distributions of means of areas from samples of size 10, 50, and 100. Use 5,000 simulations. What does each observation in the sampling distribution represent? How does the mean, standard error (i.e. the standard deviation of the sampling distribution), and shape of the sampling distribution change as the sample size increases? For a sample size of 30, does the shape of the distribution change if you increase the number of simulations from 50 to 1050 in steps of 250?
# Sampling distributions of the mean `area` for sample sizes 10, 50 and 100.
# Each is built from 5,000 simulated samples drawn with replacement.
sample_means_10 <- tibble(
  sample_means_10 = replicate(
    n = 5000,
    expr = mean(sample(x = ames$area, size = 10, replace = TRUE))
  )
)
sample_means_50_new <- tibble(
  sample_means_50 = replicate(
    n = 5000,
    expr = mean(sample(x = ames$area, size = 50, replace = TRUE))
  )
)
sample_means_100 <- tibble(
  sample_means_100 = replicate(
    n = 5000,
    expr = mean(sample(x = ames$area, size = 100, replace = TRUE))
  )
)

# One base-graphics histogram per sample size, smallest sample size first.
hist(sample_means_10$sample_means_10, breaks = 25)
hist(sample_means_50_new$sample_means_50, breaks = 25)
hist(sample_means_100$sample_means_100, breaks = 25)
## From the three histograms above we can observe that, as the sample size increases, the sample means cluster more closely around the population mean area; with a sample size of 100 the shape of the distribution is close to a normal distribution.
# Centre (mean) of each sampling distribution, by increasing sample size.
mean(sample_means_10[["sample_means_10"]])
## [1] 1497.882
mean(sample_means_50_new[["sample_means_50"]])
## [1] 1500.26
mean(sample_means_100[["sample_means_100"]])
## [1] 1499.049
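## The mean of the sampling distribution does not change systematically with the sample size: all three means (1497.88, 1500.26 and 1499.05) stay close to the population mean of 1499.69.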
# Standard error (standard deviation of each sampling distribution), by
# increasing sample size. sd() keeps missing values by default.
sd(sample_means_10[["sample_means_10"]])
## [1] 157.7351
sd(sample_means_50_new[["sample_means_50"]])
## [1] 71.55316
sd(sample_means_100[["sample_means_100"]])
## [1] 51.21775
## The standard error decreases as the sample size increases.
## If the sample size is 30
# The sample size stays at 30 throughout; only the number of simulated sample
# means changes, from 50 to 1050 in steps of 250 (50, 300, 550, 800, 1050).
## Number of simulations is 50
sample_means_30 <- tibble(sample_means_30 =
  replicate(50,
    mean(sample(ames$area, 30, replace = TRUE))))
hist(sample_means_30$sample_means_30, breaks = 25)
## Numbers of simulations are 300, 550 and 800 (the intermediate steps)
# Run inside an anonymous function so no extra variables are left behind in
# the global environment; only the histograms are kept.
invisible(lapply(seq(300, 800, by = 250), function(n_sims) {
  sim_means <- replicate(n_sims, mean(sample(ames$area, 30, replace = TRUE)))
  hist(sim_means, breaks = 25,
       xlab = "Sample mean of area",
       main = paste0("Sample size 30, ", n_sims, " simulations"))
}))
## Number of simulations is 1050
sample_means_30_new <- tibble(sample_means_1050 =
  replicate(1050,
    mean(sample(ames$area, 30, replace = TRUE))))
hist(sample_means_30_new$sample_means_1050, breaks = 25)
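## When the number of simulations is increased from 50 to 1050 (the sample size stays at 30), we can see that the shape of the histogram changes: with more simulated sample means it becomes smoother and closer to a bell curve.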