Sampling Distribution

library(tidyverse)

## Warning: package 'tidyverse' was built under R version 4.1.3

## -- Attaching packages --------------------------------------- tidyverse 1.3.1 --

## v ggplot2 3.3.5     v purrr   0.3.4
## v tibble  3.1.4     v dplyr   1.0.7
## v tidyr   1.1.3     v stringr 1.4.0
## v readr   2.1.1     v forcats 0.5.1

## Warning: package 'ggplot2' was built under R version 4.1.3

## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()

library(here)

## Warning: package 'here' was built under R version 4.1.3

## here() starts at C:/Users/user/Documents/data

library(ggplot2)
library(dplyr)

# Read the Ames housing data; here() builds the path to data/ames.csv.
ames <- read_csv(file = here("data", "ames.csv"))

## Rows: 2930 Columns: 82

## -- Column specification --------------------------------------------------------
## Delimiter: ","
## chr (43): MS.Zoning, Street, Alley, Lot.Shape, Land.Contour, Utilities, Lot....
## dbl (39): Order, PID, MS.SubClass, Lot.Frontage, Lot.Area, Overall.Qual, Ove...

## 
## i Use `spec()` to retrieve the full column specification for this data.
## i Specify the column types or set `show_col_types = FALSE` to quiet this message.

# Print the tibble to preview its dimensions and first rows.
ames

## # A tibble: 2,930 x 82
##    Order       PID MS.SubClass MS.Zoning Lot.Frontage Lot.Area Street Alley
##    <dbl>     <dbl>       <dbl> <chr>            <dbl>    <dbl> <chr>  <chr>
##  1     1 526301100          20 RL                 141    31770 Pave   <NA> 
##  2     2 526350040          20 RH                  80    11622 Pave   <NA> 
##  3     3 526351010          20 RL                  81    14267 Pave   <NA> 
##  4     4 526353030          20 RL                  93    11160 Pave   <NA> 
##  5     5 527105010          60 RL                  74    13830 Pave   <NA> 
##  6     6 527105030          60 RL                  78     9978 Pave   <NA> 
##  7     7 527127150         120 RL                  41     4920 Pave   <NA> 
##  8     8 527145080         120 RL                  43     5005 Pave   <NA> 
##  9     9 527146030         120 RL                  39     5389 Pave   <NA> 
## 10    10 527162130          60 RL                  60     7500 Pave   <NA> 
## # ... with 2,920 more rows, and 74 more variables: Lot.Shape <chr>,
## #   Land.Contour <chr>, Utilities <chr>, Lot.Config <chr>, Land.Slope <chr>,
## #   Neighborhood <chr>, Condition.1 <chr>, Condition.2 <chr>, Bldg.Type <chr>,
## #   House.Style <chr>, Overall.Qual <dbl>, Overall.Cond <dbl>,
## #   Year.Built <dbl>, Year.Remod.Add <dbl>, Roof.Style <chr>, Roof.Matl <chr>,
## #   Exterior.1st <chr>, Exterior.2nd <chr>, Mas.Vnr.Type <chr>,
## #   Mas.Vnr.Area <dbl>, Exter.Qual <chr>, Exter.Cond <chr>, ...

Setting seed

# Fix the random number generator seed so the random samples drawn below
# are repeatable from run to run.
set.seed(773952)

##1. Create a histogram of the areas variable (set binwidth = 250).

# Histogram of `area` for every home in `ames`, using 250-wide bins.
# Colour names are spelled exactly ("red", "pink"), without the stray
# leading/trailing spaces the literals previously carried.
ggplot(ames, aes(x = area)) +
  geom_histogram(binwidth = 250, colour = "red", fill = "pink") +
  ggtitle("Histogram displaying areas in terms of square foot")

# Population summary statistics for `area`: centre (mean, median),
# spread (sd, IQR), range (min, max) and the two quartiles.
ames %>%
  summarize(
    mu = mean(area),
    pop_med = median(area),
    sigma = sd(area),
    pop_iqr = IQR(area),
    pop_min = min(area),
    pop_max = max(area),
    pop_q1 = quantile(area, probs = 0.25), # first quartile, 25th percentile
    pop_q3 = quantile(area, probs = 0.75)  # third quartile, 75th percentile
  )

## # A tibble: 1 x 8
##      mu pop_med sigma pop_iqr pop_min pop_max pop_q1 pop_q3
##   <dbl>   <dbl> <dbl>   <dbl>   <dbl>   <dbl>  <dbl>  <dbl>
## 1 1500.    1442  506.    617.     334    5642   1126  1743.

2. Describe this population distribution based on the visualization above and these summary statistics. You don’t have to use all of the summary statistics in your description, you will need to decide which ones are relevant based on the shape of the distribution.

# Density-scaled histogram of the population of areas (250-wide bins) with a
# normal curve overlaid; the curve uses the mean and sd of ames$area.
# - after_stat(density) replaces the deprecated ..density.. notation.
# - `args` is supplied as a list of named arguments for dnorm().
ggplot(ames, aes(x = area)) +
  geom_histogram(
    aes(y = after_stat(density)),
    binwidth = 250,
    fill = "yellow",
    alpha = 0.25
  ) +
  stat_function(
    fun = dnorm,
    args = list(mean = mean(ames$area), sd = sd(ames$area)),
    colour = "red"
  ) +
  ggtitle("Population distribution")

From the visualization and summary statistics above, we can see that the distribution is right-skewed: most homes are clustered at the lower end of the range, with a long tail of large homes reaching up to 5642 square feet. Because of this skew, the median (1442) and the IQR (616.75) are the more relevant measures of center and spread. The first quartile is 1126, and the mean (1499.69) lies above the median because it is pulled up by the large homes.

## 3. Use sample_n to select a random sample of 50 houses from our data frame. Store the results in a new variable called samp1.

# Draw a random sample of 50 houses (rows) from the population.
samp1 <- ames %>% sample_n(size = 50)

# Print the sampled rows to inspect the sample.
samp1

## # A tibble: 50 x 82
##    Order       PID MS.SubClass MS.Zoning Lot.Frontage Lot.Area Street Alley
##    <dbl>     <dbl>       <dbl> <chr>            <dbl>    <dbl> <chr>  <chr>
##  1  2559 534455080          20 RL                  80     9600 Pave   <NA> 
##  2  2036 903456110          60 RM                  60     9780 Pave   Grvl 
##  3   704 902134120          50 RM                  56    10134 Pave   Grvl 
##  4   697 902105050          50 RM                  90    15660 Pave   <NA> 
##  5  2903 921205050          20 RL                  88    11577 Pave   <NA> 
##  6  2479 531450040          60 RL                  65     7153 Pave   <NA> 
##  7  2758 906392090          20 RL                  90    13377 Pave   <NA> 
##  8   731 903201080          50 RL                  55     7264 Pave   <NA> 
##  9  2070 905226170          20 RL                  85    13770 Pave   <NA> 
## 10  1706 528144050          60 RL                  86    10562 Pave   <NA> 
## # ... with 40 more rows, and 74 more variables: Lot.Shape <chr>,
## #   Land.Contour <chr>, Utilities <chr>, Lot.Config <chr>, Land.Slope <chr>,
## #   Neighborhood <chr>, Condition.1 <chr>, Condition.2 <chr>, Bldg.Type <chr>,
## #   House.Style <chr>, Overall.Qual <dbl>, Overall.Cond <dbl>,
## #   Year.Built <dbl>, Year.Remod.Add <dbl>, Roof.Style <chr>, Roof.Matl <chr>,
## #   Exterior.1st <chr>, Exterior.2nd <chr>, Mas.Vnr.Type <chr>,
## #   Mas.Vnr.Area <dbl>, Exter.Qual <chr>, Exter.Cond <chr>, ...

4. Describe the distribution of area in this sample. How does it compare to the distribution of the population? Hint: the sample_n function takes a random sample of observations (i.e. rows) from the dataset, you can still refer to the variables in the dataset with the same names. Code you used in the previous exercise will also be helpful for visualizing and summarizing the sample, however be careful to not label values mu and sigma anymore since these are sample statistics, NOT population parameters. You can change the labels of any of the statistics to indicate that these come from the sample

 # Histogram of `area` for the 50 sampled homes, using the same 250-wide bins
 # as the population histogram. The outline colour is spelled "yellow"
 # without the stray leading space the literal previously carried.
 ggplot(samp1, aes(x = area)) +
   geom_histogram(binwidth = 250, colour = "yellow", fill = "purple") +
   ggtitle("Distribution of area")

# Summary statistics of `area` for the sample samp1.
# Note: the `pop_` prefixes are kept so the column names match the recorded
# output, but every value here is a sample statistic, not a population
# parameter.
samp1 %>%
  summarize(
    mean_sample = mean(area),
    pop_med_sample = median(area),
    sd_sample = sd(area),
    pop_iqr_sample = IQR(area),
    pop_min_sample = min(area),
    pop_max_sample = max(area),
    pop_q1_sample = quantile(area, probs = 0.25), # first quartile, 25th percentile
    pop_q3_sample = quantile(area, probs = 0.75)  # third quartile, 75th percentile
  )

## # A tibble: 1 x 8
##   mean_sample pop_med_sample sd_sample pop_iqr_sample pop_min_sample
##         <dbl>          <dbl>     <dbl>          <dbl>          <dbl>
## 1       1571.          1452.      659.           606.            498
## # ... with 3 more variables: pop_max_sample <dbl>, pop_q1_sample <dbl>,
## #   pop_q3_sample <dbl>

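## From the plot we can see that the sample distribution is also right-skewed, and its summary statistics are close to the population values (sample median 1452 vs. population median 1442, sample IQR 606 vs. 617), although the sample mean (1571) and standard deviation (659) are somewhat larger than the population mean (1500) and standard deviation (506).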

5. Calculate the mean area of the homes in this sample of 50.

# Sample mean of area for the 50 sampled homes.
mean(samp1[["area"]])

## [1] 1570.98

6. Calculate the mean area of all the homes in our population.

# Population mean of area over all homes in `ames`.
mean(ames[["area"]])

## [1] 1499.69

7. Would you expect the mean of your sample to match the mean of another classmate’s sample? Why, or why not? If the answer is no, would you expect the means to just be somewhat different or very different? Confirm your answer by comparing with a classmate.

## I would not expect the mean of my sample to match anyone else's: I used my student ID as the seed, so my classmates will have drawn different random samples of 50 houses. I would expect the means to be only somewhat different, because every sample is drawn from the same population. I also compared my result with Chaitanya P.; he got a different mean from the one I got.

8. Take a second sample, also of size 50, and call it samp2. How does the mean of samp2 compare with the mean of samp1?

# Second random sample of 50 houses, followed by its mean area.
samp2 <- ames %>% sample_n(size = 50)
mean(samp2[["area"]])

## [1] 1517.52

## The mean of samp1 is 1570.98 and the mean of samp2 is 1517.52, so the mean of samp1 is greater than the mean of samp2.
## 9. Suppose we took two more samples, one of size 100 and one of size 1000. Which would you think would provide a more accurate estimate of the population mean? Check your answer by taking the two samples and calculating the mean of each.

# Random sample of 100 houses, followed by its mean area.
samp3 <- ames %>% sample_n(size = 100)
mean(samp3[["area"]])

## [1] 1564.65

# Random sample of 1000 houses, followed by its mean area.
samp4 <- ames %>% sample_n(size = 1000)
mean(samp4[["area"]])

## [1] 1499.725

## Since the mean of the sample of size 1000 (1499.725) is very close to the actual population mean (1499.69), I conclude that the larger sample gives the more accurate estimate, i.e., the sample of size 1000 provides the more accurate estimate of the population mean.

# Simulate a sampling distribution of the mean: 15000 repetitions, each
# taking the mean of 50 areas drawn with replacement from ames$area.
sample_means50 <- tibble(
  sample_means = replicate(
    n = 15000,
    expr = mean(sample(ames$area, size = 50, replace = TRUE))
  )
)

# Print the simulated sample means to preview the first rows.
sample_means50

## # A tibble: 15,000 x 1
##    sample_means
##           <dbl>
##  1        1572.
##  2        1603.
##  3        1387.
##  4        1421.
##  5        1543.
##  6        1477.
##  7        1527.
##  8        1551.
##  9        1542.
## 10        1486.
## # ... with 14,990 more rows

10. Create a histogram of the results stored in sample_means50.

# Histogram of the simulated sample means stored in sample_means50.
# The fill colour and the title are written without the stray leading spaces
# the literals previously carried. Binning is deliberately left at the
# geom_histogram() default, so no binwidth/bins argument is passed.
ggplot(sample_means50, aes(x = sample_means)) +
  geom_histogram(colour = "orange", fill = "pink") +
  ggtitle("Histogram showing results stored in sample_means50")

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

## 11. How many elements are there in sample_means50? Describe the sampling distribution, and be sure to specifically note its center.

# Number of elements (rows) in sample_means50.
nrow(sample_means50)

## [1] 15000

# Centre of the sampling distribution, measured by the median of the
# simulated sample means.
# median() averages the two middle values when the number of elements is
# even, whereas indexing the sorted vector at n / 2 returns only the lower
# middle value (and depends on a truncated fractional index when n is odd).
# The sorted vector is still assigned so the name remains available.
sample_means50_sort <- sort(sample_means50$sample_means)
median(sample_means50_sort)

## [1] 1498.1

There are 15000 elements in sample_means50. The sampling distribution is the probability distribution of a statistic (here, the sample mean) computed from repeated random samples from a population.

## The center of the sampling distribution is about 1498.1, which is very close to the population mean of 1499.69.

##12. To make sure you understand how sampling distributions are built, try modifying the code to create a sampling distribution of 25 sample means from samples of size 10, and put them in a data frame named sample_means_small. Plot the results. How many observations are there in this object called sample_means_small? What does each observation represent?

# Small sampling distribution: 25 sample means, each computed from 10 areas
# drawn with replacement from ames$area.
sample_means_small <- tibble(
  sample_means = replicate(
    n = 25,
    expr = mean(sample(ames$area, size = 10, replace = TRUE))
  )
)

# Plot the results, as the exercise asks.
ggplot(sample_means_small, aes(x = sample_means)) +
  geom_histogram(binwidth = 50, colour = "orange", fill = "pink") +
  ggtitle("Sampling distribution of 25 sample means (samples of size 10)")

# Print the 25 sample means.
sample_means_small

## # A tibble: 25 x 1
##    sample_means
##           <dbl>
##  1        1483.
##  2        1236.
##  3        1501.
##  4        1493.
##  5        1386.
##  6        1387 
##  7        1498.
##  8        1472 
##  9        1746.
## 10        1426.
## # ... with 15 more rows

## There are 25 observations in sample_means_small. Each observation is the mean area of one random sample of 10 houses.
## 13. Use the code below to create sampling distributions of means of areas from samples of size 10, 50, and 100. Use 5,000 simulations. What does each observation in the sampling distribution represent? How does the mean, standard error (i.e. the standard deviation of the sampling distribution), and shape of the sampling distribution change as the sample size increases? For a sample size of 30, does the shape of the distribution change if you increase the number of simulations from 50 to 1050 in steps of 250?

# Sampling distributions of the mean area for sample sizes 10, 50 and 100.
# Each uses 5000 repetitions of sampling with replacement from ames$area.
# The three simulations run in this order (10, then 50, then 100).

# Sample size 10.
sample_means_10 <- tibble(
  sample_means_10 = replicate(
    n = 5000,
    expr = mean(sample(ames$area, size = 10, replace = TRUE))
  )
)

# Sample size 50.
sample_means_50_new <- tibble(
  sample_means_50 = replicate(
    n = 5000,
    expr = mean(sample(ames$area, size = 50, replace = TRUE))
  )
)

# Sample size 100.
sample_means_100 <- tibble(
  sample_means_100 = replicate(
    n = 5000,
    expr = mean(sample(ames$area, size = 100, replace = TRUE))
  )
)


# Plot each of the three sampling distributions (sample sizes 10, 50 and
# 100), requesting 25 breaks for every histogram.
hist(sample_means_10$sample_means_10, breaks = 25)

hist(sample_means_50_new$sample_means_50, breaks = 25)

hist(sample_means_100$sample_means_100, breaks = 25)

## From the three histograms above we can observe that, as the sample size increases, the sample means cluster more tightly around the population mean area; with a sample size of 100 the shape of the sampling distribution is similar to a normal distribution.

# Mean of each sampling distribution (sample sizes 10, 50 and 100).
mean(sample_means_10[["sample_means_10"]])

## [1] 1497.882

mean(sample_means_50_new[["sample_means_50"]])

## [1] 1500.26

mean(sample_means_100[["sample_means_100"]])

## [1] 1499.049

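## The mean of the sampling distribution does not change systematically with the sample size: it stays close to the population mean of 1499.69 for every sample size (1497.88, 1500.26 and 1499.05 for n = 10, 50 and 100).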

# Standard error (standard deviation of each sampling distribution) for
# sample sizes 10, 50 and 100.
sd(sample_means_10[["sample_means_10"]])

## [1] 157.7351

sd(sample_means_50_new[["sample_means_50"]])

## [1] 71.55316

sd(sample_means_100[["sample_means_100"]])

## [1] 51.21775

## The standard error decreases as the sample size increases (157.74, 71.55 and 51.22 for n = 10, 50 and 100).
## If the sample size is 30

# Sample size 30 with 50 simulations: 50 sample means, each from 30 areas
# drawn with replacement, followed by a histogram requesting 25 breaks.
sample_means_30 <- tibble(
  sample_means_30 = replicate(
    n = 50,
    expr = mean(sample(ames$area, size = 30, replace = TRUE))
  )
)
hist(sample_means_30$sample_means_30, breaks = 25)

## Number of simulations is 1050 (the sample size is still 30)

# Sample size 30 with 1050 simulations: 1050 sample means, each from 30
# areas drawn with replacement, followed by a histogram requesting 25 breaks.
sample_means_30_new <- tibble(
  sample_means_1050 = replicate(
    n = 1050,
    expr = mean(sample(ames$area, size = 30, replace = TRUE))
  )
)
hist(sample_means_30_new$sample_means_1050, breaks = 25)

## When the number of simulations is increased to 1050 (the sample size stays at 30), we can see that the shape of the distribution changes.