Assignment 1 510

library(readxl)
library(moments)

## Warning: package 'moments' was built under R version 4.1.3

library(car)

## Warning: package 'car' was built under R version 4.1.3

## Loading required package: carData

## Warning: package 'carData' was built under R version 4.1.3

# Folder holding the assignment's CSV files on the author's machine.
data_dir <- "C:/Users/rabdo/OneDrive/Desktop/HU/510 51"

# Only change directory when that folder actually exists. An unconditional
# setwd() stops the whole script on any other machine; with the guard the
# script falls back to the current working directory, so it still runs as long
# as the CSV files are located there.
if (dir.exists(data_dir)) {
  setwd(data_dir)
} else {
  message("Data folder not found; reading CSV files from ", getwd())
}

# Load the kids' calorie data (the first row of the CSV holds the column names).
kidscalories <- read.csv("kidscalories.csv", header = TRUE)

# helpedinprep is a group code, not a quantity, so store it as a factor.
kidscalories$helpedinprep <- factor(kidscalories$helpedinprep)

#Research question
#Do children who help make dinner end up eating more?

#Hypothesis testing
#H0: Children who help cooking eat the same or less amount than those who don't
#Ha: children who help cooking eat more

# Test for skewness: D'Agostino test (moments package) on calorie intake,
# with all children pooled into a single sample.
agostino.test(kidscalories$calorieintake)

## 
##  D'Agostino skewness test
## 
## data:  kidscalories$calorieintake
## skew = -0.011821, z = -0.037082, p-value = 0.9704
## alternative hypothesis: data have a skewness

# Test for normality: Shapiro-Wilk test on the same pooled sample.
shapiro.test(kidscalories$calorieintake)

## 
##  Shapiro-Wilk normality test
## 
## data:  kidscalories$calorieintake
## W = 0.97936, p-value = 0.5663

# Histogram as a visual check of the distribution's shape.
hist(kidscalories$calorieintake)

# The data look approximately normal and unskewed: the D'Agostino skewness
# estimate is close to zero (-0.01, p = 0.97) and the Shapiro-Wilk p-value
# (0.57) is greater than .05, so the distribution is not significantly
# different from normal.
# NOTE(review): both checks pool the two helpedinprep groups; the t-test
# assumption concerns each group separately -- consider checking per group.

# Homogeneity of variance: Levene's test comparing the spread of calorie
# intake between the two helpedinprep groups.
with(kidscalories, leveneTest(calorieintake, helpedinprep))

## Levene's Test for Homogeneity of Variance (center = median)
##       Df F value Pr(>F)
## group  1  0.0264 0.8716
##       45

# Levene's test is not significant, so run the pooled-variance (Student)
# two-sample t-test. The alternative is one-sided: group 1 mean > group 2 mean.
# NOTE(review): this matches Ha only if group 1 is the "helped" group -- confirm
# the coding of helpedinprep.
t.test(
  calorieintake ~ helpedinprep,
  data = kidscalories,
  alternative = "greater",
  var.equal = TRUE
)

## 
##  Two Sample t-test
## 
## data:  calorieintake by helpedinprep
## t = 2.8137, df = 45, p-value = 0.003618
## alternative hypothesis: true difference in means between group 1 and group 2 is greater than 0
## 95 percent confidence interval:
##  34.10513      Inf
## sample estimates:
## mean in group 1 mean in group 2 
##        431.3996        346.7991

# Standard deviation of calorie intake within each group (for the write-up).
with(kidscalories, tapply(calorieintake, helpedinprep, sd))

##         1         2 
## 105.70124  99.50114

#SUMMARY
#In the current study, we examined the difference in the amount of food children eat between those who helped with cooking and those who did not. Performing a one-sided independent-samples t-test (equal variances assumed), we find a significant difference between group 1 (M = 431.3996; SD = 105.70124) and group 2 (M = 346.7991; SD = 99.50114), t(45) = 2.8137, p = 0.003618.


# Load the cholesterol data set (read.csv treats the first row as the header
# by default).
CholestoralData <- read.csv("CholestoralData.csv")

#Research question
#Does consuming one brand of margarine help lower cholesterol levels more than the other?

#Hypothesis testing
#H0: There is no difference in cholesterol change between margarine brands A and B
#Ha: One brand lowers cholesterol more than the other

# Margarine brand is categorical, so store it as a factor.
CholestoralData$Margarine <- factor(CholestoralData$Margarine)

# New column with each person's change in cholesterol, computed as After minus
# Before, so a negative value means cholesterol went down.
CholestoralData$difference <- CholestoralData$After - CholestoralData$Before

# Homogeneity of variance: Levene's test on the change scores, by margarine brand.
leveneTest(CholestoralData$difference, CholestoralData$Margarine)

## Levene's Test for Homogeneity of Variance (center = median)
##       Df F value    Pr(>F)    
## group  1  15.478 0.0003431 ***
##       38                      
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

# Levene's test is significant, so the variances are treated as unequal and
# Welch's two-sample t-test is used (two-sided) to compare the change scores.
t.test(difference ~ Margarine, data = CholestoralData, var.equal = FALSE)

## 
##  Welch Two Sample t-test
## 
## data:  difference by Margarine
## t = -3.9902, df = 19.854, p-value = 0.0007285
## alternative hypothesis: true difference in means between group A and group B is not equal to 0
## 95 percent confidence interval:
##  -5.281831 -1.654169
## sample estimates:
## mean in group A mean in group B 
##         -3.7805         -0.3125

# Non-parametric counterpart (Wilcoxon rank-sum test) of the same comparison.
wilcox.test(CholestoralData$difference~CholestoralData$Margarine)

## 
##  Wilcoxon rank sum exact test
## 
## data:  CholestoralData$difference by CholestoralData$Margarine
## W = 86, p-value = 0.001593
## alternative hypothesis: true location shift is not equal to 0

# Follow-up within margarine A only: paired t-test of Before against After.
# The paired difference here is Before minus After, so its sign is the
# opposite of the `difference` column (a positive mean is a reduction).
t.test(CholestoralData$Before[CholestoralData$Margarine == "A"], 
       CholestoralData$After[CholestoralData$Margarine == "A"], paired = TRUE)

## 
##  Paired t-test
## 
## data:  CholestoralData$Before[CholestoralData$Margarine == "A"] and CholestoralData$After[CholestoralData$Margarine == "A"]
## t = 4.3984, df = 19, p-value = 0.0003089
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  1.981502 5.579498
## sample estimates:
## mean of the differences 
##                  3.7805

# Mean change (After minus Before) for each margarine brand.
tapply(CholestoralData$difference, CholestoralData$Margarine, mean)

##       A       B 
## -3.7805 -0.3125

# Standard deviation of the change for each margarine brand.
tapply(CholestoralData$difference, CholestoralData$Margarine, sd)

##         A         B 
## 3.8438953 0.5764125

#SUMMARY
#When comparing the change in cholesterol levels for individuals eating margarine A or B using an independent t-test (for unequal variances), we find margarine A (M = -3.78; SD = 3.84) led to a greater reduction than B (M = -.31; SD = .58), t(19.85) = -3.99, p < .001. A follow-up paired t-test showed that those in the A group had a significant reduction (mean difference = 3.78, 95% confidence interval [1.98, 5.58]), t(19) = 4.40, p < .001. We can conclude that margarine A appears to lower cholesterol levels significantly more than B.

# Load the school-priorities counts (the first row of the CSV holds the column
# names). The code below uses the count columns Rural, Suburban and Urban.
PrioritiesData <- read.csv("PrioritiesData.csv", header = TRUE)

#Research question
#Do school priorities differ due to their location (rural, suburban or urban)?

#Hypothesis testing
#H0: Proportion of kids priorities are equal across locations
#Ha: Proportion of kids priorities differ across locations

#Chi Square test of independence

# Look at proportions: the share of each priority within each location.
# prop.table(..., margin = 2) divides every column by its own total and keeps
# the rows in their original order, so each share stays attached to its
# priority. (Wrapping the shares in table() instead sorts the values and
# reports a meaningless count of 1 for each.)
priority_shares <- prop.table(
  as.matrix(PrioritiesData[, c("Rural", "Suburban", "Urban")]),
  margin = 2
)
priority_shares

# The same shares were previously printed sorted in ascending order for each
# location (Rural 0.28, 0.34, 0.38; Suburban 0.14, 0.28, 0.58;
# Urban 0.14, 0.17, 0.69); re-knit to see them in row order.

# Goodness-of-fit test for each location separately: are the priorities
# equally likely within that location? (chisq.test on a single vector of
# counts tests against equal probabilities.)
chisq.test(PrioritiesData$Rural)

## 
##  Chi-squared test for given probabilities
## 
## data:  PrioritiesData$Rural
## X-squared = 1.52, df = 2, p-value = 0.4677

chisq.test(PrioritiesData$Suburban)

## 
##  Chi-squared test for given probabilities
## 
## data:  PrioritiesData$Suburban
## X-squared = 30.32, df = 2, p-value = 2.607e-07

chisq.test(PrioritiesData$Urban)

## 
##  Chi-squared test for given probabilities
## 
## data:  PrioritiesData$Urban
## X-squared = 57.38, df = 2, p-value = 3.468e-13

# Test of independence between priority and location. All three location
# columns are selected by name: selecting columns 2:3 compares only two of the
# three locations (that call gave X-squared = 9.414, df = 2, p = 0.009032),
# which does not answer the research question about rural, suburban AND urban.
chisq.test(PrioritiesData[, c("Rural", "Suburban", "Urban")])

# Expected result for the three-location table, hand-computed from the
# proportions and per-location statistics recorded above (re-knit to
# regenerate the exact console output):
#   Pearson's Chi-squared test
#   X-squared ~ 21.63, df = 4, p-value ~ 0.0002

#SUMMARY
#To examine whether students’ school priorities differ across locations (rural, suburban or urban), we performed a Chi-Square test of independence. The test revealed a significant difference in the proportions of kids’ priorities across communities. A closer examination revealed that for kids in rural areas, priorities are fairly evenly distributed (38%, 28%, and 34%), whereas for suburban (58%, 14%, 28%) and urban (69%, 14%, and 17%) areas, priorities differ significantly.

# Load the voting data (the first row of the CSV supplies the column names).
# NOTE(review): nothing after this line refers to VotingData -- the counts are
# typed in by hand below; confirm whether they should be taken from this file.
VotingData <- read.csv(file = "VotingData.csv", header = TRUE)

# Vote counts entered by hand, one vector per ballot listing order.
# NOTE(review): the three positions are presumably the same three candidates
# in the same order in every vector (Democrat, Republican, Libertarian,
# judging by the percentages quoted in the summary) -- confirm against
# VotingData.csv.
DRL <- c(63, 310, 5)
RLD <- c(69, 302, 5)
LDR <- c(61, 308, 5)

#Research question
#Does listing order impact voting?

#Hypothesis testing
#H0: There is no difference in voting across listings
#Ha: There is a difference in voting across listings

# Chi-square test of independence on the 3 x 3 table of counts
# (rows = candidates, columns = listing orders).
# NOTE(review): `df` is also the name of a function in the stats package; the
# name is kept so anything that refers to `df` keeps working.
df <- data.frame(DRL, RLD, LDR)
chisq.test(df)

## Warning in chisq.test(df): Chi-squared approximation may be incorrect

## 
##  Pearson's Chi-squared test
## 
## data:  df
## X-squared = 0.63124, df = 4, p-value = 0.9595

# The warning comes from the third row: its expected counts are about 5
# (one is slightly below 5), the usual threshold for the approximation.

# Proportions: each candidate's share of the votes within each listing order.
# prop.table(..., margin = 2) divides every column by its own total and keeps
# the rows in their original order, so each share stays attached to its
# candidate. (Wrapping the shares in table() instead sorts the values and
# reports a meaningless count of 1 for each.)
vote_shares <- prop.table(as.matrix(df), margin = 2)
round(vote_shares, 3)

# Expected shares, computed from the counts above (re-knit to regenerate the
# console output):
#        DRL   RLD   LDR
# [1,] 0.167 0.184 0.163
# [2,] 0.820 0.803 0.824
# [3,] 0.013 0.013 0.013

#SUMMARY
#We checked whether the order in which candidates are listed influenced the votes a candidate received. A Chi-Square test of independence revealed no significant difference in the proportion of votes for different candidates by candidate ordering, Chi-square = 0.63124, df = 4, p-value = 0.9595. Republican votes are the highest (82%), followed by Democrats (17%) and Libertarians (1%).

Assignment 1 510

Rabah Douadi

2023-01-21