# Descriptive analysis for Depression/life style factors paper 
  # by Kevin Linares: Jan 2015

library(foreign)
setwd("~/Dropbox/My documents/projects/UP Amigos/depression paper/working data")
DepData = read.dta("2008 2009 data for model.dta")
colnames(DepData) = tolower(colnames(DepData))
attach(DepData)

#####################################################################
###   basic summary descriptives of BMI and WC for 2008 & 2009   ####
#####################################################################

# our sample, N=339
sex.t1 = table(sex)
sex.t2 = table(sex_t2)
cbind(sex.t1, sex.t2)
##        sex.t1 sex.t2
## Male      128    129
## Female    211    210
  # participant clave 163528 selected female in T1 & male in T2

# Histogram for gender at T1 & T2
par(mfrow=c(2,1), mar=c(4,4,2,1))
barplot(sex.t1, col=rep(c("light blue", 'light pink')), 
        main="Number of males and females in 2008")
barplot(sex.t2, col=rep(c("light blue", 'light pink')),  
        main="Number of males and females in 2009")

# Age for T1 & T2 descriptives
par(mfrow=c(1,1), mar=c(4,4,3,2))
age_stats = summary(age)
age_stats_T2 = summary(calc_age_t2)
cbind(age_stats, age_stats_T2)
##         age_stats age_stats_T2
## Min.        16.00        17.00
## 1st Qu.     17.00        18.00
## Median      18.00        18.00
## Mean        17.75        18.32
## 3rd Qu.     18.00        19.00
## Max.        20.00        20.00
boxplot(cbind(age_stats, age_stats_T2), names=c("Age 2008", "Age 2009"), 
        col="light salmon", ylab='Ages', main='Age for 2008, 2009 (in years)', 
        cex.main=1.50, ylim=c(16, 20))

# boxplots of Age by gender
par(mfrow=c(2,1), mar=c(4,4,3,2))
boxplot(age~sex, data=DepData, col=rep(c("light blue", 'light pink')),  
        xlab='2008', ylab='Ages')
title(main='Age by gender in 2008 & 2009', cex.main=1.75)
boxplot(calc_age_t2 ~ sex_t2, data=DepData, col=rep(c("light blue", 'light pink')), 
        xlab='2009', ylab='Ages') 

  # frequency of males & females by age in 2008
par(mfrow=c(2,1), mar=c(4,4,3,1))
freq_age_gender_2008 = table(age, sex)
freq_age_gender_2008
##     sex
## age  Male Female
##   16    4     13
##   17   45     78
##   18   57     79
##   19   17     38
##   20    5      3
plot(freq_age_gender_2008, col=rep(c("light blue", 'light pink')), xlab='Age in 2008',
     ylab='Sex', main='Frequency of age group by gender in 2008')
freq_age_gender_2009 = table(calc_age_t2, sex_t2)
freq_age_gender_2009
##            sex_t2
## calc_age_t2 Male Female
##          17   21     40
##          18   50     93
##          19   41     60
##          20   17     17
plot(freq_age_gender_2009, col=rep(c("light blue", 'light pink')), 
     xlab='Age in 2009', ylab='Sex', main='Frequency of age group by gender in 2009')

    ##########################
    ###  Family Variables  ###
    ##########################

# marital status in 2008 & 2009
marital = table(marital_status, useNA = "ifany")
marital_t2 = table(marital_status_t2, useNA = "ifany")
cbind(marital, marital_t2)
##              marital marital_t2
## Single           267        330
## Married            0          3
## Cohabitation       0          2
## Divorced           0          1
## <NA>              72          3
# family income in 2008 & 2009 
  # (2008 metric was transformed into 2009 metric using preexisting labels)
table(sort(income), exclude=NULL)
## 
##   500  1000  2000  2500  3000  3200  3400  3500  3950  4000  4200  4500 
##     1     1     4     5    10     1     1     5     1    15     1     2 
##  4600  4800  5000  5500  5600  6000  6200  6500  6600  7000  7500  8000 
##     1     1    28     1     1    13     2     4     1    11     2    17 
##  8500  9000  9300  9500  9600 10000 12000 13000 13400 14000 15000 16000 
##     3     5     1     1     1    45    11     3     1     4    25     1 
## 17000 18000 19000 20000 22000 23000 24000 25000 28000 30000 32000 35000 
##     1     5     2    18     3     1     2     6     1     6     1     1 
## 40000 50000 58000 60000 80000  <NA> 
##     2     2     1     1     1     0
summary(income_t2)
## Less than $10,000 $10,000 - $14,999 $15,000 - $19,999 $20,000 - $24,999 
##               143                74                43                21 
## $25,000 - $49,999 More than $50,000              NA's 
##                18                 2                38
income.new=c(income)
names(income.new) = c("Less than $10,000", "$10,000 - $14,999", "$15,000 - $19,999", 
                      "$20,000 - $24,999","$25,000 - $49,999")

income.new[income >= 0] = "Less than $10,000"
income.new[income <= 9999] = "Less than $10,000"
income.new[income >= 10000 & income <= 14999] = "$10,000 - $14,999"
income.new[income >= 15000 & income <= 19999] = "$15,000 - $19,999"
income.new[income >= 20000 & income <= 24999] = "$20,000 - $24,999"
income.new[income >= 25000 & income <= 49999] = "$25,000 - $49,999"
income.new[income >= 50000] = "More than $50,000"
income.new=factor(income.new, levels=c("Less than $10,000", "$10,000 - $14,999", 
                                       "$15,000 - $19,999", 
                                       "$20,000 - $24,999","$25,000 - $49,999", 
                                       "More than $50,000"))
summary(income.new)
## Less than $10,000 $10,000 - $14,999 $15,000 - $19,999 $20,000 - $24,999 
##               140                64                34                24 
## $25,000 - $49,999 More than $50,000              NA's 
##                17                 5                55
cbind(table(income.new, useNA = "ifany"), table(income_t2, useNA = "ifany"))
##                   [,1] [,2]
## Less than $10,000  140  143
## $10,000 - $14,999   64   74
## $15,000 - $19,999   34   43
## $20,000 - $24,999   24   21
## $25,000 - $49,999   17   18
## More than $50,000    5    2
## <NA>                55   38
  # for any analysis use income_t2 from 2009 dataset, 2008 is questionable

  ### Parental Education in 2008 & 2009 ###

cbind(table(education_father,  useNA = "ifany"), table(education_father_t2,  
                                                       useNA = "ifany"), 
      table(education_mother,  useNA = "ifany"), table(education_mother_t2,  
                                                       useNA = "ifany"))
##                              [,1] [,2] [,3] [,4]
## none                          144    4  112    8
## grade school                   59   39   74   53
## middle school                  66   69   81   75
## high school                     0   44    1   48
## technical/career school        47   48   62   71
## 4 years of college education    0  101    0   77
## masters or doctorate            0   17    0    4
## do not know                     0   11    0    0
## <NA>                           23    6    9    3
cbind(table(work_father_t2), table(work_mother_t2))
##                                                     [,1] [,2]
## worked                                               267  175
## did not work, but has work                             7    7
## sought work                                            3    5
## was a student                                          2    1
## was dedicated to housework                             2  126
## was retired or a pensioner                            17   15
## was/is permanently unable to work due to disability    2    0
## do no know or prefer to not respond                   33    6
#################################################
##### descriptives on variables of interest #####
#################################################

par(mfrow=c(1,1), mar=c(4,5,4,2))


pa_counts <- table(pa_times_week)
barplot(pa_counts, density=15, xlab="Frequency", ylab="PA Counts", 
        main="Physical Activity Distribution for 1 week", 
        horiz=TRUE, col=c("darkseagreen1"))

pa_counts
## pa_times_week
##   0   1   2   3   4   5   6   7 
## 144  66  19  33  17  25  12   4
    # PA counts by sex in 2008
pa_counts_sex <- table(pa_times_week, sex)
barplot(pa_counts_sex, density=20, xlab="Sex", ylab="PA Counts",
        main="Physical Activity Distribution for 1 week", col=cm.colors(8), 
        legend = rownames(pa_counts_sex), beside=TRUE)

pa_counts_sex
##              sex
## pa_times_week Male Female
##             0   46     98
##             1   29     37
##             2    5     14
##             3   10     23
##             4    9      8
##             5   10     15
##             6    8      4
##             7    2      2
table(pa_duration)
## pa_duration
##   0   1   2   3   5  15  18  20  25  30  40  45  50  60  75  90 100 120 
## 144   2   4   2   2   8   1  15   2  35  12   6  11  36   1   5   1  32 
## 150 180 360 500 
##   1   3   1   1
hist(pa_duration, density=25, breaks=20, xlab="minutes", ylab="PA Counts", 
        main="Physical Activity Distribution for 1 week", col=c("cadetblue2"))

    # TV, Internet, & videogame
table(tv_watching_hours)
## tv_watching_hours
##   0 0.5   1   2   3   4   5   6   7   8   9  10 
##  19   4  76 116  60  31  16   3   3   4   1   1
table(videogame_hours)
## videogame_hours
##   0 0.5   1   2   3   4   5   6   8 
##  47   4 116  84  36  25   9   8   1
tv_vidgame = table(tv_watching_hours, videogame_hours)
tv_vidgame
##                  videogame_hours
## tv_watching_hours  0 0.5  1  2  3  4  5  6  8
##               0   10   1  4  2  2  0  0  0  0
##               0.5  0   0  0  2  0  1  0  0  0
##               1   12   2 29 21  4  3  2  2  1
##               2    9   1 51 26 16 10  2  0  0
##               3    9   0 15 18  7  6  3  2  0
##               4    2   0 12  6  4  2  1  3  0
##               5    2   0  5  3  2  3  0  0  0
##               6    1   0  0  2  0  0  0  0  0
##               7    1   0  0  1  1  0  0  0  0
##               8    0   0  0  3  0  0  0  1  0
##               9    0   0  0  0  0  0  1  0  0
##               10   1   0  0  0  0  0  0  0  0
par(mfrow=c(2,1), mar=c(4,5,4,2))
hist(tv_watching_hours, density=20, breaks=10, xlab="Hours", ylab="Frequency", 
     main="Frequency for hours spent watching TV", 
     col=c("cadetblue4"))

hist(tv_vidgame, density=20, breaks=10, xlab="Hours", ylab="Frequency", 
     main="Frequency for hours spent playing videogames", 
     col=c("cadetblue4"))

par(mfrow=c(1,1), mar=c(4,5,4,2))

    # internet & videogame variables 2009 data

table(internet2_t2)
## internet2_t2
##                       Daily Two to three times per week 
##                         127                         115 
##                      Weekly                     Monthly 
##                          52                          25 
##      Less than once a month 
##                          19
names_int_t2 = c("Daily", "2-3 times per week", "Weekly", "Monthly", "< once a month") 
barplot(table(internet2_t2), names=names_int_t2, density=20, col="seagreen3", 
        xlab="Often internet use", ylab="Frequency", cex.names=.75, 
        main="Barplot for how often Internet is used")

      ## By sex
par(mfrow=c(2,1), mar=c(4,5,4,2))
internet2_t2_counts = table(sex_t2, internet2_t2)
internet_t2_relfreq = internet2_t2_counts/sum(internet2_t2_counts)
barplot(internet2_t2_counts, names=names_int_t2, density=15, 
        col=c("seagreen2", "royalblue1"), xlab="Often internet use", 
        ylab="Frequency", cex.names=.75,  
        main="Barplot for how often Internet is used by sex")
legend('topright', rownames(internet2_t2_counts), density=15, 
       fill=c("seagreen2", "royalblue1"))
barplot(internet_t2_relfreq, names=names_int_t2, density=15, 
        col=c("seagreen2", "royalblue1"), xlab="Often internet use", 
        ylab="Relative Frequency", cex.names=.75, 
        main="Relative Frequency Boxplot for how often Internet is used")

par(mfrow=c(1,2), mar=c(4,4,3,2))

count_tv_week_t2 = table(tv_week_t2)
count_tv_week_t2
## tv_week_t2
##           0 min          15 min          30 min            1 hr 
##               8              14              22              99 
##           2 hrs           3 hrs           4 hrs More than 5 hrs 
##              94              52              31              18
lbls_tv_week_t2 =  names(count_tv_week_t2)
pct_tv_week_t2 = round(count_tv_week_t2/sum(count_tv_week_t2)*100)
lbls_tv_week_t2 = paste(lbls_tv_week_t2, pct_tv_week_t2)
lbls_tv_week_t2 = paste(lbls_tv_week_t2,"%",sep="")
pie(count_tv_week_t2, labels = lbls_tv_week_t2, col=cm.colors(8),
    main="Pie chart for how often 
    television is watched during the week")

count_tv_weekend_t2 = table(tv_weekend_t2)
count_tv_weekend_t2
## tv_weekend_t2
##           0 min          15 min          30 min            1 hr 
##              13              11              21              39 
##           2 hrs           3 hrs           4 hrs More than 5 hrs 
##              81              86              51              34
lbls_tv_weekend_t2 =  names(count_tv_weekend_t2)
pct_tv_weekend_t2 = round(count_tv_weekend_t2/sum(count_tv_weekend_t2)*100)
lbls_tv_weekend_t2 = paste(lbls_tv_weekend_t2, pct_tv_weekend_t2)
lbls_tv_weekend_t2 = paste(lbls_tv_weekend_t2,"%",sep="")
pie(count_tv_weekend_t2, labels = lbls_tv_weekend_t2, col=cm.colors(8),
    main="Pie chart for how often 
    television is watched on weekends")

count_game_week_t2 = table(game_week_t2)
count_game_week_t2
## game_week_t2
##           0 min          15 min          30 min            1 hr 
##              53              23              45              80 
##           2 hrs           3 hrs           4 hrs More than 5 hrs 
##              60              47              22               9
lbls_game_week_t2 =  names(count_game_week_t2)
pct_game_week_t2 = round(count_game_week_t2/sum(count_game_week_t2)*100)
lbls_game_week_t2 = paste(lbls_game_week_t2, pct_game_week_t2)
lbls_game_week_t2 = paste(lbls_game_week_t2,"%",sep="")
pie(count_game_week_t2, labels = lbls_game_week_t2, col=topo.colors(8),
    main="Pie chart for how often 
    videogames are played during the week")

count_game_weekend_t2 = table(game_weekend_t2)
count_game_weekend_t2
## game_weekend_t2
##           0 min          15 min          30 min            1 hr 
##              85              23              44              59 
##           2 hrs           3 hrs           4 hrs More than 5 hrs 
##              53              36              18              17
lbls_game_weekend_t2 =  names(count_game_weekend_t2)
pct_game_weekend_t2 = round(count_game_weekend_t2/sum(count_game_weekend_t2)*100)
lbls_game_weekend_t2 = paste(lbls_game_weekend_t2, pct_game_weekend_t2)
lbls_game_weekend_t2 = paste(lbls_game_weekend_t2,"%",sep="")
pie(count_game_weekend_t2, labels = lbls_game_weekend_t2, col=topo.colors(8),
    main="Pie chart for how often 
    videogames are played on weekends")

  ## contingency tables for watching TV during meals & if tv is visible
    # tv1_t2 = is there a tele present from where they usually eat
    # tv2_t2 =  do they watch the tele when they eat
table(tv1_t2, useNA = "ifany")
## tv1_t2
##  yes   no <NA> 
##  197  137    5
table(tv2_t2, useNA = "ifany")
## tv2_t2
##  yes   no <NA> 
##  153  179    7
freq_tv1_t2 = table(tv1_t2)
freq_tv2_t2 = table(tv2_t2)
table(is.na(tv1_t2))
## 
## FALSE  TRUE 
##   334     5
table(is.na(tv2_t2))
## 
## FALSE  TRUE 
##   332     7
relfreq_tv1_t2 = freq_tv1_t2/334 # 334 total valid responses
relfreq_tv2_t2 = freq_tv2_t2/334 # 334 total valid responses
cbind(freq_tv1_t2, relfreq_tv1_t2 ,freq_tv2_t2, relfreq_tv2_t2)
##     freq_tv1_t2 relfreq_tv1_t2 freq_tv2_t2 relfreq_tv2_t2
## yes         197      0.5898204         153      0.4580838
## no          137      0.4101796         179      0.5359281
    ## conditional distribution
library(gmodels)
joint = CrossTable(tv1_t2, tv2_t2, prop.chisq=TRUE)
## 
##  
##    Cell Contents
## |-------------------------|
## |                       N |
## | Chi-square contribution |
## |           N / Row Total |
## |           N / Col Total |
## |         N / Table Total |
## |-------------------------|
## 
##  
## Total Observations in Table:  332 
## 
##  
##              | tv2_t2 
##       tv1_t2 |       yes |        no | Row Total | 
## -------------|-----------|-----------|-----------|
##          yes |       141 |        55 |       196 | 
##              |    28.430 |    24.300 |           | 
##              |     0.719 |     0.281 |     0.590 | 
##              |     0.922 |     0.307 |           | 
##              |     0.425 |     0.166 |           | 
## -------------|-----------|-----------|-----------|
##           no |        12 |       124 |       136 | 
##              |    40.972 |    35.021 |           | 
##              |     0.088 |     0.912 |     0.410 | 
##              |     0.078 |     0.693 |           | 
##              |     0.036 |     0.373 |           | 
## -------------|-----------|-----------|-----------|
## Column Total |       153 |       179 |       332 | 
##              |     0.461 |     0.539 |           | 
## -------------|-----------|-----------|-----------|
## 
## 
joint
## $t
##      y
## x     yes  no
##   yes 141  55
##   no   12 124
## 
## $prop.row
##      y
## x            yes         no
##   yes 0.71938776 0.28061224
##   no  0.08823529 0.91176471
## 
## $prop.col
##      y
## x            yes         no
##   yes 0.92156863 0.30726257
##   no  0.07843137 0.69273743
## 
## $prop.tbl
##      y
## x            yes         no
##   yes 0.42469880 0.16566265
##   no  0.03614458 0.37349398
    ## joint distribution plot

par(mfrow=c(1,1), mar=c(2,3,2,1))

joint_cnts = joint$t
barplot(joint_cnts, beside=FALSE, density=20, col=cm.colors(2), ylab='Frequency', 
        xlab='TV visible from where they eat',
        main='Television watching during meals: stacked Bar plot')
legend("bottomleft", c('TV during meals', 'No TV during meals'), 
       density=20, cex=.75, pch=10, col=cm.colors(2))

# Most people that have a tele visible (141) from where they eat, 
  ## tend to watch tele during meals
        ## conditional distribution plot
WatchTV_given_TVvis = joint$prop.col
barplot(WatchTV_given_TVvis, beside=FALSE, density=20, col=cm.colors(2), 
        ylab='Proportion of participants that watch TV during meals', 
        xlab='TV visible from where they eat',
        main='Proportion of TV watching while eating given TV visible')
legend("bottomleft", c('TV during meals', 'No TV during meals'), density=20, 
       cex=.75, pch=10, col=cm.colors(2))

# The proportion attributed too for watching tv during meals is .92% 
    ## given that a tv is visible from where they eat.
# The proportion attributed too NOT watching tv during meals is .69% 
    ## given that a tv is NOT visible from where they eat

# some extra variables of interest #
table(frs_t2, useNA = "ifany")
## frs_t2
##    1    2    3    4    5    6    7 <NA> 
##    4   40  103  116   58   12    5    1
table(self_health_t2, useNA = "ifany")
## self_health_t2
##   A   B   C   D   E   F 
##  65 213  58   1   2   0
table(fastfood_t2, useNA = "ifany")
## fastfood_t2
##     Never 1-2 times 3-4 times 5-6 times   7 times      <NA> 
##       135       165        20         9         1         9
hist(frs_t2, density=20, breaks=7, col=terrain.colors(7), 
        main='Histogram for figure scale')

barplot(table(self_health_t2), density=25, col=cm.colors(6), 
        main='Barplot for self health scale')

barplot(table(fastfood_t2), density=25, col=cm.colors(5), 
        main='Barplot for fast food consumption')

# weight status variables (BMI, waist circumference)

# saves descriptives of bmi and wc at T1 & T2 as summary_T1/T2

summary_T1 = summary(cbind(bmi, wc))
summary_T2 = summary(cbind(bmi_t2, wc_t2))
cbind(summary_T1, summary_T2)
##       bmi                wc               bmi_t2            wc_t2         
##  "Min.   :15.05  " "Min.   : 61.00  " "Min.   :16.44  " "Min.   : 58.50  "
##  "1st Qu.:20.24  " "1st Qu.: 74.00  " "1st Qu.:20.53  " "1st Qu.: 71.00  "
##  "Median :22.55  " "Median : 81.00  " "Median :22.60  " "Median : 77.00  "
##  "Mean   :23.24  " "Mean   : 82.83  " "Mean   :23.54  " "Mean   : 79.16  "
##  "3rd Qu.:25.68  " "3rd Qu.: 89.00  " "3rd Qu.:25.59  " "3rd Qu.: 85.00  "
##  "Max.   :41.91  " "Max.   :125.00  " "Max.   :40.33  " "Max.   :132.00  "
##  NA                "NA's   :2  "      NA                "NA's   :4  "
par(mfrow=c(4,1), mar=c(4,5,2,1))

hist(subset(bmi, sex=="Male"), col="dark blue", density=20, breaks=20, 
     prob=TRUE, xlab='BMI', 
     main="Histogram for BMI among males in 2008 with density curve")
lines(density(bmi),col = "red")
          
hist(subset(bmi, sex=="Female"), col="dark green", density=20, breaks=20, 
     prob=TRUE, xlab='BMI', 
     main="Histogram for BMI among females in 2008 with density curve")
lines(density(bmi),col = "red")

hist(subset(bmi_t2, sex_t2=="Male"), col="dark blue", density=20, breaks=20, 
     prob=TRUE, xlab='BMI', 
     main="Histogram for BMI among males in 2009 with density curve")
lines(density(bmi_t2),col = "red")

hist(subset(bmi_t2, sex_t2=="Female"), col="dark green", density=20, breaks=20, 
     prob=TRUE, xlab='BMI', 
     main="Histogram for BMI among females in 2009 with density curve")
lines(density(bmi_t2),col = "red")

  # scatter plots of BMI & WC, BMI & BMI_t2, wc & wc_t2, and BMI_t2 & WC_t2 by gender

par(mfrow=c(1,1), mar=c(5,5,4,1))

  # BMI x WC
with(DepData, plot(bmi, wc, pch=18, col=rep(c("blue", "darkgreen")), 
                   xlab='BMI in 2008', ylab='Waist circumference (in Centimeters)'))
title(main='Scatterplot of BMI & Waist Circumference (2008) 
      BMI regression line set to 25 (>25 = overweight/obese)', cex.main = 1.25)
legend('topleft', c("BMI", "Waist Circumference"), pch=c(18), 
       col=c("blue", "darkgreen"), cex=1.00)
abline(lsfit(bmi, wc))
## Warning in lsfit(bmi, wc): 2 missing values deleted
abline(v=25.0, lwd=2, lty=2, col='red')

  # BMI x BMI_2
with(DepData, plot(bmi, bmi_t2, pch=18, col=rep(c("blue", "darkgreen")), 
                   xlab='BMI in 2008', ylab='BMI in 2009'))
title(main='Scatterplot of BMI in 2008 & 2009 
      BMI (2008) regression line set to 25 (>25 = overweight/obese)', 
      cex.main = 1.25)
legend('topleft', c("BMI 2008", "BMI 2008"), pch=c(18), 
       col=c("blue", "darkgreen"), cex=1.00)
abline(lsfit(bmi, bmi_t2))
abline(v=25.0, lwd=2, lty=2, col='red')

# wc & wc_t2
with(DepData, plot(wc, wc_t2, pch=18, col=rep(c("blue", "darkgreen")), 
                   xlab='Waist Circumference in 2008', 
                   ylab='Waist Circumference in 2009'))
title(main='Scatterplot of Waist Circumference (in Centimeters) 2008 & 2009', 
      cex.main = 1.25)
legend('topleft', c("Waist Circumference 2008", "Waist Circumference 2009"), 
       pch=c(18), col=c("blue", "darkgreen"), cex=1.00)
abline(lsfit(wc, wc_t2))
## Warning in lsfit(wc, wc_t2): 6 missing values deleted

# BMI_t2 & WC_t2
with(DepData, plot(bmi_t2, wc_t2, pch=18, col=rep(c("blue", "darkgreen")), 
                   xlab='BMI in 2009', ylab='Waist circumference (in Centimeters)'))
title(main='Scatterplot of BMI & Waist Circumference (2009) 
      BMI regression line set to 25 (>25 = overweight/obese)', cex.main = 1.25)
legend('topleft', c("BMI", "Waist Circumference"), pch=c(18), 
       col=c("blue", "darkgreen"), cex=1.00)
abline(lsfit(bmi_t2, wc_t2))
## Warning in lsfit(bmi_t2, wc_t2): 4 missing values deleted
abline(v=25.0, lwd=2, lty=2, col='red')

# BMI x WC scatterplots by gender

library(lattice)

par(mfrow=c(1,1), mar=c(4,5,4,2))

panel.function = function(x, y) {
  panel.xyplot(x, y)
  panel.abline(lm(y ~ x))
}

  # BMI x WC by sex
xyplot(wc ~ bmi | sex, data=DepData, origin=0, panel = panel.function, 
       xlab='BMI in 2008', ylab='Waist circumference (in centimeters)',
       main='Scatterplot for BMI & Waist Circumference (2008) across sex', 
       cex.main = 2.0)

  # BMI x BMI_T2 by sex
xyplot(bmi_t2 ~ bmi | sex, data=DepData, origin=0, panel = panel.function, 
       xlab='BMI in 2008', ylab='BMI in 2009',
       main='Scatterplot for BMI in 2008 & 2009 across sex', cex.main = 2.0)

  # WC & WC_t2 by sex
xyplot(wc_t2 ~ wc | sex, data=DepData, origin=0, panel = panel.function, 
       xlab='Waist Circumference (centimeters) in 2008', 
       ylab='Waist Circumference (centimeters) in 2009',
       main='Scatterplot for Waist Circumference in 2008 & 2009 across sex', 
       cex.main = 2.0)

# WC_t2 x BMI_T2 by sex
xyplot(wc_t2 ~ bmi_t2 | sex, data=DepData, origin=0, panel = panel.function, 
       xlab='BMI in 2009', ylab='Waist Circumference (centimeters) 2009',
       main='Scatterplot for BMI & Waist Circumference in (2009) across sex', 
       cex.main = 2.0)

library(ggplot2) 

qplot(bmi_t2, data=DepData, geom="density", fill=internet2_t2, alpha=I(.5),
      main="Distribution of BMI", xlab="BMI",
      ylab="Density")

#######################################################
######## Some analysis on BMI & Depression ############
#######################################################

# Difference between BMI 2008 & BMI 2009

  # paired t-test for bmi 2008-2009
t.test(bmi, bmi_t2, paired=TRUE)
## 
##  Paired t-test
## 
## data:  bmi and bmi_t2
## t = -3.3494, df = 338, p-value = 0.0009013
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -0.4775823 -0.1241834
## sample estimates:
## mean of the differences 
##              -0.3008829
  # 
 # sig. differences in BMI suggesst further examining the difference as a seperate variables

BMI_Dif = bmi- bmi_t2
summary(BMI_Dif)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
## -9.3590 -1.0530 -0.2081 -0.3009  0.5312  5.7020
plot(sort(BMI_Dif), ylab="Difference in BMI", xlab="cases", 
     main="Sorted scatter plot for BMI difference between 2008 & 2009")

qplot(BMI_Dif, data=DepData, geom="density", fill=sex, alpha=I(.5),
      main="Distribution of BMI", xlab="BMI Difference",
      ylab="Density")

qplot(sex, BMI_Dif, data=DepData, geom=c("boxplot"), fill=sex, alpha=I(.5),
      main="Boxplot of BMI difference by Sex", xlab="",
      ylab="BMI difference")

  # 1 - way ANOVE for BMI difference across gender
BMI_sex_dif_fit = aov(BMI_Dif ~ sex, data=DepData)
summary(BMI_sex_dif_fit) 
##              Df Sum Sq Mean Sq F value Pr(>F)
## sex           1    1.0   1.003   0.366  0.546
## Residuals   337  923.6   2.741
# CES-D data
ces_d = data.frame(ces = rpois(20, 34))
tabList <- lapply(ces_d, table)
cbind(table("cesd_1", "cesd_1_t2", "cesd_2", "cesd_2_t2", "cesd_3", 
      "cesd_3_t2", "cesd_4", "cesd_4_t2", "cesd_5", "cesd_5_t2", 
      "cesd_6", "cesd_6_t2", "cesd_7", "cesd_7_t2", "cesd_8", 
      "cesd_8_t2", "cesd_9", "cesd_9_t2", "cesd_10", "cesd_10_t2"))
##      [,1]
## [1,]    1