#install.packages("psych")
# Setup
library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.2 ──
## ✔ ggplot2 3.3.6 ✔ purrr 0.3.5
## ✔ tibble 3.1.8 ✔ dplyr 1.0.10
## ✔ tidyr 1.2.1 ✔ stringr 1.4.1
## ✔ readr 2.1.3 ✔ forcats 0.5.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
library(psych)
## Warning: package 'psych' was built under R version 4.2.2
##
## Attaching package: 'psych'
##
## The following objects are masked from 'package:ggplot2':
##
## %+%, alpha
library(infer)
# Print the current working directory. The read_csv() calls in this script use
# bare file names, so they are resolved relative to this directory.
getwd()
## [1] "C:/Users/Jerome/Documents/0000_Work_Files/0000_Montgomery_College/Data_Science_101/Data_101_Fall_2022/Homework_10_Due_14Nov22"
### 1st Test - One Proportion Z test
# H0: p = 0.22 vs. Ha: p > 0.22, with 125 successes observed in 500 trials.
# prop.test() applies the continuity correction by default, as the recorded
# output header states.
prop.test(
  x = 125,
  n = 500,
  p = 0.22,
  alternative = "greater"
)
##
## 1-sample proportions test with continuity correction
##
## data: 125 out of 500, null probability 0.22
## X-squared = 2.4505, df = 1, p-value = 0.05874
## alternative hypothesis: true p is greater than 0.22
## 95 percent confidence interval:
## 0.218598 1.000000
## sample estimates:
## p
## 0.25
### 2nd Test - Two Proportion Z Test
# Ha: the first group's proportion (231 / 1000) exceeds the second group's
# (176 / 1200). Successes and trials are given as parallel vectors.
prop.test(
  x = c(231, 176),
  n = c(1000, 1200),
  alternative = "greater"
)
##
## 2-sample test for equality of proportions with continuity correction
##
## data: c(231, 176) out of c(1000, 1200)
## X-squared = 25.173, df = 1, p-value = 2.621e-07
## alternative hypothesis: greater
## 95 percent confidence interval:
## 0.05579805 1.00000000
## sample estimates:
## prop 1 prop 2
## 0.2310000 0.1466667
### 3rd Test - One Mean t-test
# Euro_Men_LE.csv has no header row. When it was read with readr's default
# (`col_names = TRUE`), the recorded message was "Rows: 52 Columns: 1" with a
# single column named `73`: the first observation had been consumed as the
# column name and silently dropped from the sample. Supplying the column name
# keeps every observation in the data.
# NOTE: the t-test output recorded below this call was produced before this
# fix (n = 52, df = 51, "data: euro_men_le"); re-run the script to refresh it.
euro_men_le <- read_csv("Euro_Men_LE.csv", col_names = "life_expectancy")

# One-sided test of H0: mu = 79.8 vs. Ha: mu < 79.8 with a 99% confidence
# bound. Pass the numeric column itself rather than the whole tibble so the
# test does not depend on how t.test() happens to coerce a data frame.
t.test(
  euro_men_le$life_expectancy,
  mu = 79.8,
  conf.level = 0.99,
  alternative = "less"
)
##
## One Sample t-test
##
## data: euro_men_le
## t = -7.5437, df = 51, p-value = 3.777e-10
## alternative hypothesis: true mean is less than 79.8
## 99 percent confidence interval:
## -Inf 75.67616
## sample estimates:
## mean of x
## 73.75
### 4th Test - Independent 2 Sample t-test
# Load the two cholesterol samples (columns Heart_Attack and Healthy).
cholesterol <- read_csv(file = "cholesterol.csv")
## Rows: 30 Columns: 2
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (2): Heart_Attack, Healthy
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# One-sided two-sample comparison (Ha: mean of Heart_Attack is greater than the
# mean of Healthy) with a 99% confidence bound. t.test() runs the Welch version
# here, as the recorded output header shows.
t.test(
  x = cholesterol$Heart_Attack,
  y = cholesterol$Healthy,
  alternative = "greater",
  conf.level = 0.99
)
##
## Welch Two Sample t-test
##
## data: cholesterol$Heart_Attack and cholesterol$Healthy
## t = 6.1452, df = 37.675, p-value = 1.86e-07
## alternative hypothesis: true difference in means is greater than 0
## 99 percent confidence interval:
## 36.7602 Inf
## sample estimates:
## mean of x mean of y
## 253.9286 193.1333
### 4a Get cute w/ the histogram
# Long-format version of the cholesterol data: one row per person, with the
# cholesterol value and a Heart_Attack group label.
cholesterol2 <- read_csv("cholesterol2.csv")
## Rows: 58 Columns: 3
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (1): Heart_Attack
## dbl (2): Cholestoral_Level, HA_N
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Map the group to `fill`, not `colour`. For bars, `colour` only sets the
# outline, so the bars themselves stay grey and every empty bin is still drawn
# as a zero-height outline, i.e. a coloured line lying on the x-axis. With
# `fill`, the bars are shaded by group and empty bins draw nothing visible.
p1 <- cholesterol2 %>%
  ggplot(aes(x = Cholestoral_Level, fill = Heart_Attack)) +
  geom_histogram(binwidth = 10) +
  ggtitle("Histogram of Cholesterol by Incidence of Heart Attack")
p1

### The histogram shows that while some persons w/ low cholesterol levels had heart attacks, every person with a cholesterol level above ~ 250 had a heart attack (in this sample).
# Cross-tabulate the group label (rows) against each distinct cholesterol value
# (columns) so the histogram can be checked against the raw counts.
table(
  cholesterol2[["Heart_Attack"]],
  cholesterol2[["Cholestoral_Level"]]
)
##
## 142 160 162 164 166 170 176 178 182 184 186 188 196 198 200
## Heart Attack 1 1 0 0 0 0 0 0 0 0 1 0 0 0 0
## No Heart Attack 0 1 1 1 1 1 1 2 5 1 1 1 1 3 2
##
## 204 206 210 212 218 220 224 226 230 232 234 236 238 242 244
## Heart Attack 0 1 1 0 0 1 1 1 0 0 1 2 0 1 1
## No Heart Attack 1 1 0 1 1 0 0 0 1 1 0 0 1 1 0
##
## 266 270 272 276 278 280 282 288 294 310 318 360
## Heart Attack 1 1 1 1 1 2 2 2 1 1 1 1
## No Heart Attack 0 0 0 0 0 0 0 0 0 0 0 0
### In the histogram, I did not understand the green line on the abscissa. I thought it meant there was a negligible number of persons w/ high cholesterol who had no heart attacks. To verify this, I constructed this table. It appears R simply drew the abscissa line green instead of black where the count is zero; no individuals who had cholesterol levels above 242 were in the No Heart Attack group (everyone above 244 had a heart attack).
### 4b Boxplot
# Side-by-side boxplots of cholesterol level, one box per Heart_Attack group.
# Axis labels and title are set explicitly, so they do not depend on how the
# formula is written.
boxplot(
  Cholestoral_Level ~ Heart_Attack,
  data = cholesterol2,
  main = "Boxplot of Cholesterol Level by Heart Attack vs. No Heart Attack",
  xlab = "Heart Attack vs. No Heart Attack",
  ylab = "Cholesterol Level"
)

### 5th Test - Dependent Paired 2 Sample t-test
# Each row holds one pair of measurements (columns Cardboard and Metal).
NZ_Helmets <- read_csv(file = "NZ_Helmets.csv")
## Rows: 18 Columns: 2
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (2): Cardboard, Metal
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Two-sided paired test on the row-wise differences Cardboard - Metal, at the
# default 95% confidence level.
t.test(
  x = NZ_Helmets$Cardboard,
  y = NZ_Helmets$Metal,
  paired = TRUE
)
##
## Paired t-test
##
## data: NZ_Helmets$Cardboard and NZ_Helmets$Metal
## t = 3.1854, df = 17, p-value = 0.005415
## alternative hypothesis: true mean difference is not equal to 0
## 95 percent confidence interval:
## 0.5440163 2.6782060
## sample estimates:
## mean difference
## 1.611111
### My only question w/ this exercise is why n = 18 and not 30. I suppose it is because the t-test was designed for small samples (it remains valid for small n as long as the paired differences are approximately normal); in researching this just now, I learned Gosset's original paper used an n of 4. I guess 18 is sufficient.
### Try ANOVA
# Load the NYC listings file; `price` and `room_type` are the columns used by
# the ANOVA fit.
airbnb <- read_csv(file = "AB_NYC_2019.csv")
## Rows: 48895 Columns: 16
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (5): name, host_name, neighbourhood_group, neighbourhood, room_type
## dbl (10): id, host_id, latitude, longitude, price, minimum_nights, number_o...
## date (1): last_review
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# One-way ANOVA of listing price across room types. Fit the model once and
# reuse the stored object for every display below; refitting on the full data
# just to print the model is redundant work and yields the same output.
airbnb_anova <- aov(price ~ room_type, data = airbnb)
airbnb_anova
## Call:
## aov(formula = price ~ room_type, data = airbnb)
##
## Terms:
## room_type Residuals
## Sum of Squares 185024882 2634888908
## Deg. of Freedom 2 48892
##
## Residual standard error: 232.1466
## Estimated effects may be unbalanced
# ANOVA table (F test for any difference in mean price between room types).
summary(airbnb_anova)
## Df Sum Sq Mean Sq F value Pr(>F)
## room_type 2 1.850e+08 92512441 1717 <2e-16 ***
## Residuals 48892 2.635e+09 53892
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Pairwise comparisons of mean price between room types, with a 95%
# family-wise confidence level, using the fitted ANOVA model.
TukeyHSD(
  x = airbnb_anova,
  conf.level = 0.95
)
## Tukey multiple comparisons of means
## 95% family-wise confidence level
##
## Fit: aov(formula = price ~ room_type, data = airbnb)
##
## $room_type
## diff lwr upr p adj
## Private room-Entire home/apt -122.01327 -127.00423 -117.022319 0.0000000
## Shared room-Entire home/apt -141.66666 -158.00204 -125.331281 0.0000000
## Shared room-Private room -19.65339 -36.03793 -3.268839 0.0136779
### I got this far w/ ANOVA. I'll deal w/ effect size later. That requires installing the effectsize package, I guess.
### Try chi-square
#### 1st w/ one categorical variable. chisq.test() works on counts, not on raw
#### labels, which is why passing room_type directly did not work. Passing the
#### raw price column did run, but it treated every listing's price as the
#### observed count of its own category (hence the earlier df = 48894), which
#### is not a meaningful test. Tabulate room_type first, then run the
#### goodness-of-fit test on the category counts. With no `p` supplied, the
#### null hypothesis is that all room types are equally likely.
#### The output recorded for the earlier chisq.test(airbnb$price) call no
#### longer applies and has been dropped; re-run to record the new result.
chisq.test(table(airbnb$room_type))