#HOMEWORK 7

  1. Select your favorite dependent variable from your dataset, and ensure that it meets the criteria for parametric tests (normal distribution, continuous, etc.). If it does not, transform it using the log or sqrt transformation. Demonstrate that you have done this, if needed. Run a histogram to reveal the distribution of your dependent variable.

  2. Run an ANOVA with “aov()” on this variable, including some kind of grouping variable (dependent~group,data=yourdata)

  3. save the resulting aov as an object in the homework

  4. run “summary()” on the saved object. What do you see? Explain the results to me.

  5. Run the same aov, but this time as a linear model “lm()”. Save the linear model, and run summary() on it, just like you did with the aov(). What do you see? Explain the results to me. Are they similar? Different? Does the linear model explain the data more than the aov()?

  6. Knit and submit via CANVAS

library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.6
## ✔ forcats   1.0.1     ✔ stringr   1.6.0
## ✔ ggplot2   4.0.1     ✔ tibble    3.3.1
## ✔ lubridate 1.9.4     ✔ tidyr     1.3.2
## ✔ purrr     1.2.1     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(readxl)
library(ggplot2)
library(dplyr)

#1
# Load the Texas federal funds workbook (path is relative to the working
# directory).
project_data <- read_excel("texas federal funds.xlsx")

# Keep the first column plus four funding programs, selected by position.
# Column positions were looked up with the MS Excel formula =COLUMN(PT1):
#   236 = ENVIRONMENTAL HEALTH
#   436 = MATERNAL AND CHILD HEALTH SERVICES BLOCK GRANT TO THE STATES
#   243 = EVEN START - STATE EDUCATIONAL AGENCIES
#   739 = STATE ADMINISTRATIVE EXPENSES FOR CHILD NUTRITION
older_data <- project_data[, c(1, 236, 436, 243, 739)]

colnames(older_data) <- c(
  "Time", "Environmental", "MaternalChild", "EvenStart", "ChildNutrition"
)

# Convert ALL four funding columns to numeric before dropping incomplete rows.
# Previously only columns 2:4 were converted ahead of na.omit(), so entries in
# ChildNutrition that fail numeric conversion turned into NA only after the
# NA filter had already run and could leak into the log transform.
funding_cols <- c("Environmental", "MaternalChild", "EvenStart", "ChildNutrition")
older_data[funding_cols] <- lapply(
  older_data[funding_cols],
  function(x) as.numeric(as.character(x))
)

# Remove rows with a missing value in any retained column.
older_data <- na.omit(older_data)
#1
# Natural-log transform of the child nutrition funding, kept next to Time.
CNdata_log <- older_data %>%
  transmute(Time, LOG_CN = log(ChildNutrition))

# Preview the first rows of the transformed data.
head(CNdata_log)
## # A tibble: 6 × 2
##   Time  LOG_CN
##   <chr>  <dbl>
## 1 1996    16.0
## 2 1998    16.0
## 3 1999    16.4
## 4 2000    16.2
## 5 2001    16.3
## 6 2002    16.2
# Density-scaled histogram of the log-transformed child nutrition funding.
# TRUE is spelled out because the shorthand T is an ordinary variable that can
# be reassigned.
hist(CNdata_log$LOG_CN, breaks = 10, probability = TRUE)

#checked other dependent variables listed above (Environmental, MaternalChild, EvenStart) - ChildNutrition created the most normally distributed graph 


# Natural-log transform of the environmental health funding, kept next to Time.
ENVdata_log <- older_data %>%
  mutate(LOG_ENV = log(Environmental)) %>%
  select(Time, LOG_ENV)
#will be used for homework assignment as additional variable


# Both log-transformed variables side by side, one row per Time value.
project_data_log <- older_data %>%
  mutate(
    LOG_CN = log(ChildNutrition),
    LOG_ENV = log(Environmental)
  ) %>%
  select(Time, LOG_CN, LOG_ENV)
#dataset for assignment to compare variables
#2 #3
# Fit LOG_ENV against LOG_CN with aov() and keep the fitted object.
# LOG_CN is numeric, so it enters the model as a single 1-df term rather than
# as a set of groups.
data_anova <- aov(LOG_ENV ~ LOG_CN, data = project_data_log)

#4
# ANOVA table: Df, sums of squares, F value and p-value for the LOG_CN term.
summary(data_anova)
##             Df Sum Sq Mean Sq F value Pr(>F)
## LOG_CN       1  1.215  1.2149   2.469  0.144
## Residuals   11  5.414  0.4921

From the summary of the ANOVA, we can see that the p-value for the ChildNutrition term (LOG_CN) is 0.144, which is above the conventional 0.05 threshold and therefore not statistically significant. We therefore fail to reject the null hypothesis: this model gives no evidence that the log of Environmental funding varies with the log of ChildNutrition funding. Note that the model contains only these two variables; our categorical variable (Time) is not part of it. A non-significant result does not prove the null hypothesis; it only means we lack evidence against it. This result is not entirely surprising, as previous homework assignments have also failed to find evidence against the null hypothesis.

#5
# Same model as the aov() above, fitted as a linear regression.
data_lm_anova <- lm(LOG_ENV ~ LOG_CN, data = project_data_log)

# Coefficient table, residual summary, R-squared values and overall F-test.
summary(data_lm_anova)
## 
## Call:
## lm(formula = LOG_ENV ~ LOG_CN, data = project_data_log)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -1.0765 -0.5043  0.2787  0.4196  1.0364 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)
## (Intercept)  -4.8769    12.3148  -0.396    0.700
## LOG_CN        1.1871     0.7556   1.571    0.144
## 
## Residual standard error: 0.7015 on 11 degrees of freedom
## Multiple R-squared:  0.1833, Adjusted R-squared:  0.109 
## F-statistic: 2.469 on 1 and 11 DF,  p-value: 0.1444

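Additional information provided by the summary of the linear model includes the coefficient estimates and the R-squared values (multiple R-squared = 0.1833, adjusted R-squared = 0.109), suggesting that the regression model fits the data poorly. The F-statistic (2.469) and p-value (0.144) match the aov() summary, so the two commands describe the same underlying fit; the linear model summary simply reports more detail about it.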

# TukeyHSD() compares group means, so it needs a factor with several
# observations per level. as.factor(LOG_CN) turned every distinct value into
# its own level, which left zero residual degrees of freedom and produced NaN
# for every interval and adjusted p-value.
# Bin LOG_CN into terciles instead so that each group holds multiple rows.
# NOTE: output recorded from the earlier one-level-per-observation fit is
# stale; re-knit to refresh it.
tukey_data <- project_data_log %>%
  mutate(
    CN_group = cut(
      LOG_CN,
      breaks = quantile(LOG_CN, probs = c(0, 1 / 3, 2 / 3, 1), na.rm = TRUE),
      labels = c("low", "mid", "high"),
      include.lowest = TRUE # keep the minimum value inside the first bin
    )
  )

# Linear model of LOG_ENV on the three-level funding group.
data_la_tukey <- lm(LOG_ENV ~ CN_group, data = tukey_data)

# Pairwise comparisons between the low / mid / high child nutrition groups.
TukeyHSD(aov(data_la_tukey))
## Warning in qtukey(conf.level, length(means), x$df.residual): NaNs produced
##   Tukey multiple comparisons of means
##     95% family-wise confidence level
## 
## Fit: aov(formula = data_la_tukey)
## 
## $`as.factor(LOG_CN)`
##                                          diff lwr upr p adj
## 16.0388383514377-15.975546808783   1.61582090 NaN NaN   NaN
## 16.0491078866012-15.975546808783  -0.48485165 NaN NaN   NaN
## 16.1601800483127-15.975546808783  -0.06114139 NaN NaN   NaN
## 16.191878684611-15.975546808783    1.35438756 NaN NaN   NaN
## 16.1954831752117-15.975546808783   1.22534977 NaN NaN   NaN
## 16.2497435459142-15.975546808783   1.14653696 NaN NaN   NaN
## 16.2591032519611-15.975546808783   1.09319995 NaN NaN   NaN
## 16.2952384879678-15.975546808783   1.16246402 NaN NaN   NaN
## 16.3678384448687-15.975546808783  -0.01647817 NaN NaN   NaN
## 16.4811117472266-15.975546808783   1.52408146 NaN NaN   NaN
## 16.62827488586-15.975546808783     1.69877357 NaN NaN   NaN
## 16.9589640580649-15.975546808783   1.24717941 NaN NaN   NaN
## 16.0491078866012-16.0388383514377 -2.10067254 NaN NaN   NaN
## 16.1601800483127-16.0388383514377 -1.67696228 NaN NaN   NaN
## 16.191878684611-16.0388383514377  -0.26143333 NaN NaN   NaN
## 16.1954831752117-16.0388383514377 -0.39047113 NaN NaN   NaN
## 16.2497435459142-16.0388383514377 -0.46928393 NaN NaN   NaN
## 16.2591032519611-16.0388383514377 -0.52262095 NaN NaN   NaN
## 16.2952384879678-16.0388383514377 -0.45335688 NaN NaN   NaN
## 16.3678384448687-16.0388383514377 -1.63229907 NaN NaN   NaN
## 16.4811117472266-16.0388383514377 -0.09173944 NaN NaN   NaN
## 16.62827488586-16.0388383514377    0.08295267 NaN NaN   NaN
## 16.9589640580649-16.0388383514377 -0.36864149 NaN NaN   NaN
## 16.1601800483127-16.0491078866012  0.42371026 NaN NaN   NaN
## 16.191878684611-16.0491078866012   1.83923921 NaN NaN   NaN
## 16.1954831752117-16.0491078866012  1.71020141 NaN NaN   NaN
## 16.2497435459142-16.0491078866012  1.63138861 NaN NaN   NaN
## 16.2591032519611-16.0491078866012  1.57805160 NaN NaN   NaN
## 16.2952384879678-16.0491078866012  1.64731566 NaN NaN   NaN
## 16.3678384448687-16.0491078866012  0.46837348 NaN NaN   NaN
## 16.4811117472266-16.0491078866012  2.00893310 NaN NaN   NaN
## 16.62827488586-16.0491078866012    2.18362521 NaN NaN   NaN
## 16.9589640580649-16.0491078866012  1.73203105 NaN NaN   NaN
## 16.191878684611-16.1601800483127   1.41552895 NaN NaN   NaN
## 16.1954831752117-16.1601800483127  1.28649116 NaN NaN   NaN
## 16.2497435459142-16.1601800483127  1.20767835 NaN NaN   NaN
## 16.2591032519611-16.1601800483127  1.15434134 NaN NaN   NaN
## 16.2952384879678-16.1601800483127  1.22360540 NaN NaN   NaN
## 16.3678384448687-16.1601800483127  0.04466322 NaN NaN   NaN
## 16.4811117472266-16.1601800483127  1.58522285 NaN NaN   NaN
## 16.62827488586-16.1601800483127    1.75991495 NaN NaN   NaN
## 16.9589640580649-16.1601800483127  1.30832080 NaN NaN   NaN
## 16.1954831752117-16.191878684611  -0.12903780 NaN NaN   NaN
## 16.2497435459142-16.191878684611  -0.20785060 NaN NaN   NaN
## 16.2591032519611-16.191878684611  -0.26118761 NaN NaN   NaN
## 16.2952384879678-16.191878684611  -0.19192355 NaN NaN   NaN
## 16.3678384448687-16.191878684611  -1.37086573 NaN NaN   NaN
## 16.4811117472266-16.191878684611   0.16969389 NaN NaN   NaN
## 16.62827488586-16.191878684611     0.34438600 NaN NaN   NaN
## 16.9589640580649-16.191878684611  -0.10720815 NaN NaN   NaN
## 16.2497435459142-16.1954831752117 -0.07881281 NaN NaN   NaN
## 16.2591032519611-16.1954831752117 -0.13214982 NaN NaN   NaN
## 16.2952384879678-16.1954831752117 -0.06288575 NaN NaN   NaN
## 16.3678384448687-16.1954831752117 -1.24182794 NaN NaN   NaN
## 16.4811117472266-16.1954831752117  0.29873169 NaN NaN   NaN
## 16.62827488586-16.1954831752117    0.47342380 NaN NaN   NaN
## 16.9589640580649-16.1954831752117  0.02182964 NaN NaN   NaN
## 16.2591032519611-16.2497435459142 -0.05333701 NaN NaN   NaN
## 16.2952384879678-16.2497435459142  0.01592705 NaN NaN   NaN
## 16.3678384448687-16.2497435459142 -1.16301513 NaN NaN   NaN
## 16.4811117472266-16.2497435459142  0.37754450 NaN NaN   NaN
## 16.62827488586-16.2497435459142    0.55223661 NaN NaN   NaN
## 16.9589640580649-16.2497435459142  0.10064245 NaN NaN   NaN
## 16.2952384879678-16.2591032519611  0.06926407 NaN NaN   NaN
## 16.3678384448687-16.2591032519611 -1.10967812 NaN NaN   NaN
## 16.4811117472266-16.2591032519611  0.43088151 NaN NaN   NaN
## 16.62827488586-16.2591032519611    0.60557362 NaN NaN   NaN
## 16.9589640580649-16.2591032519611  0.15397946 NaN NaN   NaN
## 16.3678384448687-16.2952384879678 -1.17894219 NaN NaN   NaN
## 16.4811117472266-16.2952384879678  0.36161744 NaN NaN   NaN
## 16.62827488586-16.2952384879678    0.53630955 NaN NaN   NaN
## 16.9589640580649-16.2952384879678  0.08471539 NaN NaN   NaN
## 16.4811117472266-16.3678384448687  1.54055963 NaN NaN   NaN
## 16.62827488586-16.3678384448687    1.71525174 NaN NaN   NaN
## 16.9589640580649-16.3678384448687  1.26365758 NaN NaN   NaN
## 16.62827488586-16.4811117472266    0.17469211 NaN NaN   NaN
## 16.9589640580649-16.4811117472266 -0.27690205 NaN NaN   NaN
## 16.9589640580649-16.62827488586   -0.45159416 NaN NaN   NaN

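The TukeyHSD command, despite my efforts, does not function as intended with as.factor(LOG_CN), because my dataset does not include a categorical variable with repeated observations per group. Converting the continuous LOG_CN variable to a factor makes each of its 13 distinct values its own level, so the output contains one row for every pair of levels (13 × 12 / 2 = 78 rows), which is many more rows than I anticipated given that only three variables are included in the dataset. With a single observation per level there are no residual degrees of freedom left, which is why every confidence bound and adjusted p-value is reported as NaN.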