#install.packages("BSDA")
library(BSDA)
## Loading required package: lattice
##
## Attaching package: 'BSDA'
## The following object is masked from 'package:datasets':
##
## Orange
# Every sample below is a window of consecutive days centered on the same day,
# so the series is stored once and each sample is sliced from it. This replaces
# 20 hand-typed, overlapping vectors that had to be kept in sync manually.
daily_highs <- c(
  # the 19 days before the center day, oldest first
  51, 53, 68, 57, 52, 50, 42, 38, 32, 33, 30, 32, 29, 33, 39, 36, 41, 42, 33,
  # center day: NYC high temp on 1/15/25
  31,
  # the 19 days after the center day
  29, 42, 46, 40, 26, 19, 22, 29, 34, 32, 43, 41, 41, 53, 37, 44, 38, 31, 34
)

# Position of the center day within daily_highs (the middle element, 20).
center_index <- (length(daily_highs) + 1L) %/% 2L

# Returns the center day's value together with `days_each_side` days before
# and after it, in the same order as daily_highs. Stops with an error if the
# requested window would run past either end of the series.
take_centered_window <- function(days_each_side) {
  stopifnot(
    days_each_side >= 0,
    center_index - days_each_side >= 1,
    center_index + days_each_side <= length(daily_highs)
  )
  daily_highs[(center_index - days_each_side):(center_index + days_each_side)]
}

# NUMBER 1: Using weather data from NYC on 1/15/25 (high temp that day)
sample1 <- take_centered_window(0)

# NUMBER 2: Each new sample adds two days to the previous one (the day before
# its first day and the day after its last day); the mean and sd of every
# sample are then calculated.
sample2 <- take_centered_window(1)
mean2 <- mean(sample2)
sd2 <- sd(sample2)
sample3 <- take_centered_window(2)
mean3 <- mean(sample3)
sd3 <- sd(sample3)
sample4 <- take_centered_window(3)
mean4 <- mean(sample4)
sd4 <- sd(sample4)
sample5 <- take_centered_window(4)
mean5 <- mean(sample5)
sd5 <- sd(sample5)
sample6 <- take_centered_window(5)
mean6 <- mean(sample6)
sd6 <- sd(sample6)
sample7 <- take_centered_window(6)
mean7 <- mean(sample7)
sd7 <- sd(sample7)
sample8 <- take_centered_window(7)
mean8 <- mean(sample8)
sd8 <- sd(sample8)
sample9 <- take_centered_window(8)
mean9 <- mean(sample9)
sd9 <- sd(sample9)
sample10 <- take_centered_window(9)
mean10 <- mean(sample10)
sd10 <- sd(sample10)
sample11 <- take_centered_window(10)
mean11 <- mean(sample11)
sd11 <- sd(sample11)
sample12 <- take_centered_window(11)
mean12 <- mean(sample12)
sd12 <- sd(sample12)
sample13 <- take_centered_window(12)
mean13 <- mean(sample13)
sd13 <- sd(sample13)
sample14 <- take_centered_window(13)
mean14 <- mean(sample14)
sd14 <- sd(sample14)
sample15 <- take_centered_window(14)
mean15 <- mean(sample15)
sd15 <- sd(sample15)
sample16 <- take_centered_window(15)
mean16 <- mean(sample16)
sd16 <- sd(sample16)
sample17 <- take_centered_window(16)
mean17 <- mean(sample17)
sd17 <- sd(sample17)
sample18 <- take_centered_window(17)
mean18 <- mean(sample18)
sd18 <- sd(sample18)
sample19 <- take_centered_window(18)
mean19 <- mean(sample19)
sd19 <- sd(sample19)
sample20 <- take_centered_window(19)
mean20 <- mean(sample20)
sd20 <- sd(sample20)
# NUMBER 3: Plotting means and standard deviations and finding their means + analysis
# One mean per sample, smallest sample first. sample1 holds a single value, so
# that value is used directly as its mean.
means <- c(
  sample1, mean2, mean3, mean4, mean5, mean6, mean7, mean8, mean9, mean10,
  mean11, mean12, mean13, mean14, mean15, mean16, mean17, mean18, mean19,
  mean20
)

# Histogram of the 20 sample means.
hist(
  means,
  breaks = 10,
  col = "blue",
  border = "black",
  main = "Distribution of Sample Means",
  xlab = "Sample Means"
)

# One standard deviation per sample, starting at sample2: no sd was computed
# for the single-value sample1, so this vector has 19 entries (one fewer than
# `means`).
sds <- c(
  sd2, sd3, sd4, sd5, sd6, sd7, sd8, sd9, sd10, sd11, sd12, sd13, sd14, sd15,
  sd16, sd17, sd18, sd19, sd20
)

# Histogram of the 19 sample standard deviations.
hist(
  sds,
  breaks = 10,
  col = "red",
  border = "black",
  main = "Distribution of Sample Standard Deviations",
  xlab = "Sample Standard Deviations"
)

# Average of the sample means.
mean_of_means <- mean(means)
mean_of_means
## [1] 35.27053
# Average of the sample standard deviations.
mean_of_sds <- mean(sds)
mean_of_sds
## [1] 7.297509
# The average of all of the means (mean_of_means) was about 35, which makes sense according to the graph, as the highest frequency of means was in the 33-36 range. This indicates that as the sample size increased and more samples were added to the overall distribution, the mean roughly stabilized at around 35 degrees. Similarly, with the standard deviations, we see that there was initially a good amount of variability between the standard deviations of the samples, but as the sample sizes increased, the standard deviation generally landed around 7 (also the mean_of_sds). This stabilization of both the means and the standard deviations likely tells us that it is better to have a larger sample size, since this will provide more reliable estimates for the mean and sd!
# NUMBER 4: Conducting a z-test on a sample of data from a month earlier + analysis (note: I decided to run the test on a sample from a month earlier instead of a month later because the data so far has been from this past month, January/February 2025, and I cannot predict what the weather will be for the next month)
# Comparison sample: 20 temperature readings taken from a month earlier.
month_earlier_sample <- c(
  38, 41, 41, 40, 34, 40, 54, 51, 50, 60,
  41, 32, 33, 41, 52, 58, 53, 44, 37, 32
)

# Sample standard deviation of the earlier month's readings; it is supplied to
# z.test() as sigma.y.
sd_y <- sd(month_earlier_sample)
sd_y
## [1] 8.708133
# Two-sided, two-sample z-test of H0: no difference in means (mu = 0) between
# the sample means and the month-earlier readings, at the 95% level.
# NOTE(review): `x` is the vector of 20 sample means (taken from nested,
# overlapping samples), while `sigma.x` is the average SD of the raw daily
# values. Confirm this pairing is what the exercise intends; comparing the raw
# readings (sample20 with sd20) against month_earlier_sample would be the
# like-for-like test.
z.test(
  x = means,
  y = month_earlier_sample,
  mu = 0,
  sigma.x = mean_of_sds,
  sigma.y = sd_y,
  conf.level = 0.95
)
##
## Two-sample z-Test
##
## data: means and month_earlier_sample
## z = -3.2786, p-value = 0.001043
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -13.308800 -3.350131
## sample estimates:
## mean of x mean of y
## 35.27053 43.60000
# The two data sets show a statistically significant difference because of the low p-value (0.001043, which is less than 0.05). This means that, at the 5% significance level, we reject the null hypothesis, which states that the two months have the same mean temperature. Therefore, the new data *does* differ from the larger dataset. I would be inclined to conclude that the new data is not part of the original dataset because the small p-value indicates that it is unlikely that the current month's temperatures came from the same distribution as the earlier month's data. External factors could have led to changes between the two months (particularly seasonal changes).