221115_IC_10_Hypo_Testing

#install.packages("psych")

# Setup
library(tidyverse)

## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.2 ──
## ✔ ggplot2 3.3.6      ✔ purrr   0.3.5 
## ✔ tibble  3.1.8      ✔ dplyr   1.0.10
## ✔ tidyr   1.2.1      ✔ stringr 1.4.1 
## ✔ readr   2.1.3      ✔ forcats 0.5.2 
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()

library(psych)

## Warning: package 'psych' was built under R version 4.2.2

## 
## Attaching package: 'psych'
## 
## The following objects are masked from 'package:ggplot2':
## 
##     %+%, alpha

library(infer)
# Show the current working directory; the read_csv() calls below use bare file
# names, which are resolved relative to it.
getwd()

## [1] "C:/Users/Jerome/Documents/0000_Work_Files/0000_Montgomery_College/Data_Science_101/Data_101_Fall_2022/Homework_10_Due_14Nov22"

### 1st Test - One Proportion Z test
# One-sample proportion test: 125 successes out of 500 trials against a null
# proportion of 0.22, with the one-sided alternative that the true p is greater.
prop.test(
  x = 125,
  n = 500,
  p = 0.22,
  alternative = "greater"
)

## 
##  1-sample proportions test with continuity correction
## 
## data:  125 out of 500, null probability 0.22
## X-squared = 2.4505, df = 1, p-value = 0.05874
## alternative hypothesis: true p is greater than 0.22
## 95 percent confidence interval:
##  0.218598 1.000000
## sample estimates:
##    p 
## 0.25

### 2nd Test - Two Proportion Z Test
# Two-sample test for equality of proportions: 231 of 1000 versus 176 of 1200,
# with the one-sided alternative that the first proportion is the greater one.
prop.test(
  x = c(231, 176),
  n = c(1000, 1200),
  alternative = "greater"
)

## 
##  2-sample test for equality of proportions with continuity correction
## 
## data:  c(231, 176) out of c(1000, 1200)
## X-squared = 25.173, df = 1, p-value = 2.621e-07
## alternative hypothesis: greater
## 95 percent confidence interval:
##  0.05579805 1.00000000
## sample estimates:
##    prop 1    prop 2 
## 0.2310000 0.1466667

### 3rd Test - One Mean t-test
# Euro_Men_LE.csv appears to have no header row: the import log recorded below
# shows readr naming the single column `73`, i.e. the first observation was
# consumed as a header and dropped from the data (52 rows read). Supplying
# col_names makes readr treat every line, including the first, as data.
# NOTE(review): the recorded import log and t-test output below predate this
# fix - re-knit to refresh them.
euro_men_le <- read_csv("Euro_Men_LE.csv", col_names = "life_expectancy")

## Rows: 52 Columns: 1
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (1): 73
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

# One-sample, one-sided ("less") t-test of the mean against mu = 79.8 at the
# 99% confidence level.
# t.test() documents `x` as a numeric vector, so pull the single column out of
# the tibble with [[1]] instead of passing the whole data frame and relying on
# implicit coercion. (The `data:` label in the printed result changes
# accordingly.)
t.test(euro_men_le[[1]], mu = 79.8, conf.level = 0.99, alternative = "less")

## 
##  One Sample t-test
## 
## data:  euro_men_le
## t = -7.5437, df = 51, p-value = 3.777e-10
## alternative hypothesis: true mean is less than 79.8
## 99 percent confidence interval:
##      -Inf 75.67616
## sample estimates:
## mean of x 
##     73.75

### 4th Test - Independent 2 Sample t-test
# Load the two-column cholesterol data (Heart_Attack, Healthy) from the
# working directory.
cholesterol <- read_csv(file = "cholesterol.csv")

## Rows: 30 Columns: 2
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (2): Heart_Attack, Healthy
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

# Two-sample t-test of Heart_Attack versus Healthy cholesterol levels.
# var.equal is left at its default, so this is the Welch (unequal-variance)
# form; the alternative is one-sided ("greater") at 99% confidence.
t.test(
  x = cholesterol$Heart_Attack,
  y = cholesterol$Healthy,
  alternative = "greater",
  conf.level = 0.99
)

## 
##  Welch Two Sample t-test
## 
## data:  cholesterol$Heart_Attack and cholesterol$Healthy
## t = 6.1452, df = 37.675, p-value = 1.86e-07
## alternative hypothesis: true difference in means is greater than 0
## 99 percent confidence interval:
##  36.7602     Inf
## sample estimates:
## mean of x mean of y 
##  253.9286  193.1333

### 4a Get cute w/ the histogram
# Load the long-format cholesterol data: one row per person, with a
# Heart_Attack label column and a numeric Cholestoral_Level column.
cholesterol2 <- read_csv(file = "cholesterol2.csv")

## Rows: 58 Columns: 3
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (1): Heart_Attack
## dbl (2): Cholestoral_Level, HA_N
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

# Histogram of cholesterol level with the bars shaded by heart-attack status.
# `fill` is the aesthetic that shades histogram bars. `colour` only draws the
# bar outlines - including the outlines of zero-height bars in empty bins,
# which show up as a coloured line lying along the x-axis.
p1 <- cholesterol2 %>%
  ggplot(aes(x = Cholestoral_Level, fill = Heart_Attack)) +
  geom_histogram(binwidth = 10) +
  ggtitle("Histogram of Cholesterol by Incidence of Heart Attack")
p1

### The histogram shows that while some persons w/ low cholesterol levels had heart attacks, every person with a cholesterol level above ~ 250 had a heart attack (in this sample).

# Cross-tabulate heart-attack status (rows) against cholesterol level
# (columns) to see exactly which levels occur in each group.
table(
  cholesterol2[["Heart_Attack"]],
  cholesterol2[["Cholestoral_Level"]]
)

##                  
##                   142 160 162 164 166 170 176 178 182 184 186 188 196 198 200
##   Heart Attack      1   1   0   0   0   0   0   0   0   0   1   0   0   0   0
##   No Heart Attack   0   1   1   1   1   1   1   2   5   1   1   1   1   3   2
##                  
##                   204 206 210 212 218 220 224 226 230 232 234 236 238 242 244
##   Heart Attack      0   1   1   0   0   1   1   1   0   0   1   2   0   1   1
##   No Heart Attack   1   1   0   1   1   0   0   0   1   1   0   0   1   1   0
##                  
##                   266 270 272 276 278 280 282 288 294 310 318 360
##   Heart Attack      1   1   1   1   1   2   2   2   1   1   1   1
##   No Heart Attack   0   0   0   0   0   0   0   0   0   0   0   0

### In the histogram, I did not understand the green line on the abscissa. I thought it meant there was a negligible number of persons w/ high cholesterol who had no heart attacks. To verify this, I constructed this table. It appears R simply made the abscissa line green instead of black; no individuals with cholesterol levels above 242 were in the No Heart Attack group, i.e. everyone in this sample with a level above 242 had a heart attack.

### 4b Boxplot
# Side-by-side boxplots of cholesterol level, one box per heart-attack group.
boxplot(
  Cholestoral_Level ~ Heart_Attack,
  data = cholesterol2,
  main = "Boxplot of Cholesterol Level by Heart Attack vs. No Heart Attack",
  xlab = "Heart Attack vs. No Heart Attack",
  ylab = "Cholesterol Level"
)

### 5th Test - Dependent Paired 2 Sample t-test
# Load the paired helmet measurements (columns Cardboard and Metal).
NZ_Helmets <- read_csv(file = "NZ_Helmets.csv")

## Rows: 18 Columns: 2
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (2): Cardboard, Metal
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

# Paired t-test on the row-wise Cardboard - Metal differences; the
# alternative (two-sided) and confidence level (95%) are left at their defaults.
t.test(
  x = NZ_Helmets$Cardboard,
  y = NZ_Helmets$Metal,
  paired = TRUE
)

## 
##  Paired t-test
## 
## data:  NZ_Helmets$Cardboard and NZ_Helmets$Metal
## t = 3.1854, df = 17, p-value = 0.005415
## alternative hypothesis: true mean difference is not equal to 0
## 95 percent confidence interval:
##  0.5440163 2.6782060
## sample estimates:
## mean difference 
##        1.611111

### My only question w/ this exercise is why n = 18 and not 30. I suppose it is because the t-test was designed for small samples and remains valid as long as the paired differences are roughly normal; in researching this just now, I learned Gosset's original paper used an n of 4. I guess 18 is sufficient.

### Try ANOVA

# Load the Airbnb listings file; the ANOVA and chi-square sections below use
# its price and room_type columns.
airbnb <- read_csv(file = "AB_NYC_2019.csv")

## Rows: 48895 Columns: 16
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr   (5): name, host_name, neighbourhood_group, neighbourhood, room_type
## dbl  (10): id, host_id, latitude, longitude, price, minimum_nights, number_o...
## date  (1): last_review
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

# One-way ANOVA of price by room_type. Fit the model once and print the stored
# object, rather than calling aov() a second time on the same data just to
# display it; the printed Call and sums of squares are identical.
airbnb_anova <- aov(price ~ room_type, data = airbnb)
airbnb_anova

## Call:
##    aov(formula = price ~ room_type, data = airbnb)
## 
## Terms:
##                  room_type  Residuals
## Sum of Squares   185024882 2634888908
## Deg. of Freedom          2      48892
## 
## Residual standard error: 232.1466
## Estimated effects may be unbalanced

# ANOVA table (Df, Sum Sq, Mean Sq, F value, p-value) for the fitted model.
summary(object = airbnb_anova)

##                Df    Sum Sq  Mean Sq F value Pr(>F)    
## room_type       2 1.850e+08 92512441    1717 <2e-16 ***
## Residuals   48892 2.635e+09    53892                   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

# Tukey HSD post-hoc comparison of every pair of room types, using a 95%
# family-wise confidence level.
TukeyHSD(x = airbnb_anova, conf.level = 0.95)

##   Tukey multiple comparisons of means
##     95% family-wise confidence level
## 
## Fit: aov(formula = price ~ room_type, data = airbnb)
## 
## $room_type
##                                    diff        lwr         upr     p adj
## Private room-Entire home/apt -122.01327 -127.00423 -117.022319 0.0000000
## Shared room-Entire home/apt  -141.66666 -158.00204 -125.331281 0.0000000
## Shared room-Private room      -19.65339  -36.03793   -3.268839 0.0136779

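### I got this far w/ ANOVA. I'll deal w/ effect size later. That requires installing the effectsize package, I guess - although eta-squared can also be read straight off the ANOVA table above as SS(room_type) / SS(total) = 1.850e+08 / (1.850e+08 + 2.635e+09), or about 0.066.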

### Try chi-square
#### 1st w/ one categorical variable (room_type): chi-square goodness-of-fit.
#### chisq.test() expects category counts, not one raw label per listing, which
#### is presumably why passing airbnb$room_type directly did not work. Passing
#### the numeric airbnb$price column ran, but it treated each listing's price
#### as the observed count of its own category (hence df = 48894 in the output
#### recorded below), so that result is not a meaningful test.
#### Tabulating room_type first gives the counts the test needs; with no `p`
#### supplied, the null hypothesis is that all room types are equally likely.
#### NOTE(review): the output recorded below predates this fix - re-knit to
#### refresh it.
chisq.test(table(airbnb$room_type))

## 
##  Chi-squared test for given probabilities
## 
## data:  airbnb$price
## X-squared = 18464517, df = 48894, p-value < 2.2e-16

221115_IC_10_Hypo_Testing_rja

Jerome

2022-11-15