library(knitr)
library(ggplot2)
library(dplyr)
library(tidyr)
Make sure your data and R Markdown files are in the same directory. When loaded, your data file will be called brfss2013. Delete this note before you submit your work.
# One-time setup (kept for reference, not run): load the course .RData file,
# convert brfss2013 with tbl_df() and cache the result as an RDS file "df1".
# load("C:/Users/GaryFH/Dropbox/Rstudio/R data files/Duke introduction to statistics with R/DukeCourseProject1/_384b2d9eda4b29131fb681b243a7767d_brfss2013.RData")
# class(brfss2013)
# df1 <- tbl_df(brfss2013)
# saveRDS(df1, "df1")

# Read the cached data set back from the working directory.
df1 <- readRDS(file = "df1")
## Select only needed variables
dfrq1 <- df1 %>%
  select(menthlth, exeroft1, exerhmm1, exeroft2, exerhmm2, strength)
## The exercise variables are integers, so they are added together into a
## single exercise number - the higher the number the more overall exercise
## NOTE(review): these BRFSS fields may hold coded values rather than plain
## counts - confirm that a straight sum is meaningful
dfrq1b <- dfrq1 %>%
  mutate(
    exercise = (exeroft1 + exerhmm1 + exeroft2 + exerhmm2 + strength),
    mental_health = menthlth
  )
dfrq1bb <- dfrq1b %>%
  select(mental_health, exercise)
## Keep only rows where both values are present
dfrq1c <- dfrq1bb %>%
  drop_na()
## Group by number of days mental health is not good
aaa <- dfrq1c %>%
  group_by(mental_health)
## Average exercise level within each group of bad mental health days
bbb <- aaa %>%
  summarise(avg = mean(exercise))
#plot(x=bbb$avg,y=bbb$mental_health)
## Scatter plot of the average exercise number (x) against the number of bad
## mental health days (y), with a straight line fitted by method = "lm".
## The data frame is handed to ggplot() once and the columns are mapped by bare
## name; using `bbb$...` inside aes() bypasses ggplot2's data handling and
## breaks as soon as the plot is faceted or given different data.
g1 <- ggplot(data = bbb, aes(x = avg, y = mental_health)) +
  geom_point(color = "red", alpha = .6) +
  # se = FALSE (not the reassignable alias F) suppresses the confidence band
  geom_smooth(method = "lm", color = "dark red", se = FALSE) +
  labs(
    x = "Avg level of exercise",
    y = "Number of bad mental health days per month"
  ) +
  labs(title = "Impact of exercise level on perceived mental health")
g1
## Linear model of mental_health on every other column of bbb (that is, `avg`)
fit <- lm(formula = mental_health ~ ., data = bbb)
summary(fit)
##
## Call:
## lm(formula = mental_health ~ ., data = bbb)
##
## Residuals:
## Min 1Q Median 3Q Max
## -15.062 -6.648 -0.704 6.509 15.350
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -79.45101 30.67981 -2.590 0.01487 *
## avg 0.16318 0.05294 3.082 0.00448 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 8.026 on 29 degrees of freedom
## Multiple R-squared: 0.2467, Adjusted R-squared: 0.2208
## F-statistic: 9.499 on 1 and 29 DF, p-value: 0.004477
## Select only needed variables
dfrq2 <- df1 %>%
  select(
    qlhlth2, fruit1, fvgreen, fvorang, vegetab1, sleptim1,
    exeroft1, exerhmm1, exeroft2, exerhmm2, strength
  )
## The exercise variables are integers, so they are added together into a
## single exercise number - the higher the number the more overall exercise.
## The four fruit/vegetable variables are summed the same way into `diet`.
dfrq2b <- dfrq2 %>%
  mutate(
    exercise = (exeroft1 + exerhmm1 + exeroft2 + exerhmm2 + strength),
    diet = (fruit1 + fvgreen + fvorang + vegetab1),
    full_energy = qlhlth2
  )
dfrq2bb <- dfrq2b %>%
  select(full_energy, exercise, diet, sleptim1)
## Keep only rows where all four values are present
dfrq2c <- dfrq2bb %>%
  drop_na()
## One row per full_energy value, holding the group means of each predictor
dfrq2d <- dfrq2c %>%
  group_by(full_energy)
dfrq2e <- dfrq2d %>%
  summarise(
    avg_exercise = mean(exercise),
    avg_diet = mean(diet),
    avg_sleep = mean(sleptim1)
  )
## Three scatter plots of full_energy against one averaged predictor each
## (diet, exercise, sleep), every one with a straight line fitted by
## method = "lm". geom_smooth() inherits the mapping given to ggplot(), so it
## is not repeated; se = FALSE (not the reassignable alias F) hides the band.
g2 <- ggplot(data = dfrq2e, aes(x = avg_diet, y = full_energy)) +
  geom_point() +
  geom_smooth(method = "lm", color = "blue", se = FALSE) +
  labs(
    x = "Avg diet score - higher is better",
    y = "Number of FULL ENERGY days per month"
  ) +
  labs(title = "Impact of diet on perceived FULL ENERGY days")
g2
g3 <- ggplot(data = dfrq2e, aes(x = avg_exercise, y = full_energy)) +
  geom_point() +
  geom_smooth(method = "lm", color = "green", se = FALSE) +
  labs(
    x = "Avg level of exercise",
    y = "Number of FULL ENERGY days per month"
  ) +
  labs(title = "Impact of exercise on perceived FULL ENERGY days")
g3
g4 <- ggplot(data = dfrq2e, aes(x = avg_sleep, y = full_energy)) +
  geom_point() +
  geom_smooth(method = "lm", color = "orange", se = FALSE) +
  labs(
    x = "Avg level of sleep",
    y = "Number of FULL ENERGY days per month"
  ) +
  labs(title = "Impact of sleep time on perceived FULL ENERGY days")
g4
## One simple linear regression of full_energy per averaged predictor
fit2 <- lm(formula = full_energy ~ avg_exercise, data = dfrq2e)
fit3 <- lm(formula = full_energy ~ avg_diet, data = dfrq2e)
fit4 <- lm(formula = full_energy ~ avg_sleep, data = dfrq2e)
summary(fit2)
##
## Call:
## lm(formula = full_energy ~ avg_exercise, data = dfrq2e)
##
## Residuals:
## Min 1Q Median 3Q Max
## -16.420 -9.168 1.006 10.203 14.216
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2.52828 9.52874 0.265 0.794
## avg_exercise 0.02210 0.01694 1.305 0.210
##
## Residual standard error: 10.9 on 16 degrees of freedom
## Multiple R-squared: 0.09616, Adjusted R-squared: 0.03967
## F-statistic: 1.702 on 1 and 16 DF, p-value: 0.2104
## Regression summary for fit3; its printed output follows
summary(fit3)
##
## Call:
## lm(formula = full_energy ~ avg_diet, data = dfrq2e)
##
## Residuals:
## Min 1Q Median 3Q Max
## -12.906 -6.802 -2.788 7.839 13.702
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -28.31596 14.16317 -1.999 0.06286 .
## avg_diet 0.05084 0.01662 3.058 0.00751 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 9.111 on 16 degrees of freedom
## Multiple R-squared: 0.3689, Adjusted R-squared: 0.3295
## F-statistic: 9.354 on 1 and 16 DF, p-value: 0.007506
## Regression summary for fit4; its printed output follows
summary(fit4)
##
## Call:
## lm(formula = full_energy ~ avg_sleep, data = dfrq2e)
##
## Residuals:
## Min 1Q Median 3Q Max
## -14.996 -10.256 -1.994 11.649 14.986
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 9.935 25.157 0.395 0.698
## avg_sleep 0.653 3.578 0.183 0.857
##
## Residual standard error: 11.46 on 16 degrees of freedom
## Multiple R-squared: 0.002078, Adjusted R-squared: -0.06029
## F-statistic: 0.03331 on 1 and 16 DF, p-value: 0.8575
The data indicates a minimal correlation between either exercise or sleep and the number of perceived days FULL OF ENERGY (note that the p-values were relatively high and the adjusted R-squared values relatively low). The data indicates a stronger correlation between diet and the number of perceived days FULL OF ENERGY. Note that the p-value (0.007506) is relatively low, which suggests that we should reject the “null hypothesis”, thus indicating a correlation. However, the low adjusted R-squared (0.3295) indicates a relatively weak relationship that should not be read as causal, and further investigation may uncover confounding variables (such as a possible placebo effect, whereby people who think they eat well would also report having more energy).
## Select only needed variables
dfrq3 <- df1 %>%
  select(painact2, ssbsugar, ssbfrut2, alcday5, avedrnk2, drnk3ge5, maxdrnks)
## Build two usage scores: sugar drinks (sum of the two sugar-drink variables)
## and alcohol (drinking days plus 5x each of the three drink-count variables)
dfrq3b <- dfrq3 %>%
  mutate(
    sugardrink_usage = (ssbsugar + ssbfrut2),
    alcohol_usage = (alcday5 + (5 * avedrnk2) + (5 * drnk3ge5) + (5 * maxdrnks)),
    hard_to_work_days = painact2
  )
## Sugar-drink data set: outcome plus sugar score, complete rows only
dfrq3bb <- dfrq3b %>%
  select(hard_to_work_days, sugardrink_usage)
dfrq3c <- dfrq3bb %>%
  drop_na()
## Alcohol data set: outcome plus alcohol score, complete rows only
dfrq3d <- dfrq3b %>%
  select(hard_to_work_days, alcohol_usage) %>%
  drop_na()
## Scatter plots of hard_to_work_days against each usage score, every one with
## a straight line fitted by method = "lm". geom_smooth() inherits the mapping
## given to ggplot(); se = FALSE (not the reassignable alias F) hides the band.
g6 <- ggplot(data = dfrq3c, aes(x = sugardrink_usage, y = hard_to_work_days)) +
  geom_point() +
  geom_smooth(method = "lm", color = "magenta", se = FALSE) +
  labs(
    x = "Sugar Drink consumption level",
    y = "Number of HARD TO WORK DAYS per month"
  ) +
  labs(title = "Impact of sugar drinks on the number of HARD TO WORK DAYS per month")
g6
## The alcohol plot previously reused the sugar-drink title; it now names the
## variable that is actually on its x axis.
g7 <- ggplot(data = dfrq3d, aes(x = alcohol_usage, y = hard_to_work_days)) +
  geom_point() +
  geom_smooth(method = "lm", color = "orange", se = FALSE) +
  labs(
    x = "Alcohol consumption level",
    y = "Number of HARD TO WORK DAYS per month"
  ) +
  labs(title = "Impact of alcohol consumption on the number of HARD TO WORK DAYS per month")
g7
## Simple linear regressions of hard_to_work_days on each usage score
fit5 <- lm(formula = hard_to_work_days ~ sugardrink_usage, data = dfrq3c)
fit6 <- lm(formula = hard_to_work_days ~ alcohol_usage, data = dfrq3d)
summary(fit5)
##
## Call:
## lm(formula = hard_to_work_days ~ sugardrink_usage, data = dfrq3c)
##
## Residuals:
## Min 1Q Median 3Q Max
## -5.956 -5.747 -5.533 -1.300 24.732
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 5.955890 0.760785 7.829 3.28e-14 ***
## sugardrink_usage -0.001042 0.002235 -0.466 0.641
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 10.61 on 472 degrees of freedom
## Multiple R-squared: 0.0004601, Adjusted R-squared: -0.001658
## F-statistic: 0.2172 on 1 and 472 DF, p-value: 0.6414
## Regression summary for fit6; its printed output follows
summary(fit6)
##
## Call:
## lm(formula = hard_to_work_days ~ alcohol_usage, data = dfrq3d)
##
## Residuals:
## Min 1Q Median 3Q Max
## -5.438 -4.016 -3.853 -2.106 26.785
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2.12889 2.63333 0.808 0.420
## alcohol_usage 0.00817 0.01191 0.686 0.494
##
## Residual standard error: 8.756 on 151 degrees of freedom
## Multiple R-squared: 0.003108, Adjusted R-squared: -0.003494
## F-statistic: 0.4708 on 1 and 151 DF, p-value: 0.4937