#set Dir first
library(tidyverse)
## -- Attaching packages ------------- tidyverse 1.3.0 --
## √ ggplot2 3.3.2     √ purrr   0.3.4
## √ tibble  3.0.3     √ dplyr   1.0.2
## √ tidyr   1.1.2     √ stringr 1.4.0
## √ readr   1.3.1     √ forcats 0.5.0
## -- Conflicts ---------------- tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
library(rmarkdown)
library(lme4)
## Loading required package: Matrix
## 
## Attaching package: 'Matrix'
## The following objects are masked from 'package:tidyr':
## 
##     expand, pack, unpack
library(emmeans)
library(effects)
## Loading required package: carData
## lattice theme set by effectsTheme()
## See ?effectsTheme for details.
states_data <- read_rds("dataSets/states.rds")
tail(states_data)
##            state  region     pop  area density metro waste energy miles toxic
## 46       Vermont N. East  563000  9249   60.87  23.4  0.69    232  10.4  1.81
## 47      Virginia   South 6187000 39598  156.25  72.5  1.45    306   9.7 12.87
## 48    Washington    West 4867000 66582   73.10  81.7  1.05    389   9.2  8.51
## 49 West Virginia   South 1793000 24087   74.44  36.4  0.95    415   8.6 21.30
## 50     Wisconsin Midwest 4892000 54314   90.07  67.4  0.70    288   9.1  9.20
## 51       Wyoming    West  454000 97105    4.68  29.6  0.70    786  12.8 25.51
##     green house senate csat vsat msat percent expense income high college
## 46  15.17    85     94  890  424  466      68    6738 34.717 80.8    24.3
## 47  18.72    33     54  890  424  466      60    4836 38.838 75.2    24.5
## 48  16.51    52     64  913  433  480      49    5000 36.338 83.8    22.9
## 49  51.14    48     57  926  441  485      17    4911 24.233 66.0    12.3
## 50  20.58    47     57 1023  481  542      11    5871 34.309 78.6    17.7
## 51 114.40     0     10  980  466  514      13    5723 31.576 83.0    18.8
head(states_data)
##        state region      pop   area density metro waste energy miles toxic
## 1    Alabama  South  4041000  52423   77.08  67.4  1.11    393  10.5 27.86
## 2     Alaska   West   550000 570374    0.96  41.1  0.91    991   7.2 37.41
## 3    Arizona   West  3665000 113642   32.25  79.0  0.79    258   9.7 19.65
## 4   Arkansas  South  2351000  52075   45.15  40.1  0.85    330   8.9 24.60
## 5 California   West 29760000 155973  190.80  95.7  1.51    246   8.7  3.26
## 6   Colorado   West  3294000 103730   31.76  81.5  0.73    273   8.3  2.25
##   green house senate csat vsat msat percent expense income high college
## 1 29.25    30     10  991  476  515       8    3627 27.498 66.9    15.7
## 2    NA     0     20  920  439  481      41    8330 48.254 86.6    23.0
## 3 18.37    13     33  932  442  490      26    4309 32.093 78.7    20.3
## 4 26.04    25     37 1005  482  523       6    3700 24.643 66.3    13.3
## 5 15.65    50     47  897  415  482      47    4491 41.716 76.2    23.4
## 6 21.89    36     58  959  453  506      29    5064 35.123 84.4    27.0
sts_ex_sat <- 
  states_data %>% 
  select(expense, csat)
cor(sts_ex_sat, use = "pairwise") 
##            expense       csat
## expense  1.0000000 -0.4662978
## csat    -0.4662978  1.0000000
qplot(x = expense, y = csat, geom = "point", data = sts_ex_sat)

sat_mod1 <- lm(csat ~ 1 + expense, # regression formula
              data = states_data) # data 
lm(csat ~ 1 + expense, # regression formula
   data = states_data) # data 
## 
## Call:
## lm(formula = csat ~ 1 + expense, data = states_data)
## 
## Coefficients:
## (Intercept)      expense  
##  1060.73244     -0.02228
sat_mod1
## 
## Call:
## lm(formula = csat ~ 1 + expense, data = states_data)
## 
## Coefficients:
## (Intercept)      expense  
##  1060.73244     -0.02228
summary(sat_mod1)
## 
## Call:
## lm(formula = csat ~ 1 + expense, data = states_data)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -131.811  -38.085    5.607   37.852  136.495 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  1.061e+03  3.270e+01   32.44  < 2e-16 ***
## expense     -2.228e-02  6.037e-03   -3.69 0.000563 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 59.81 on 49 degrees of freedom
## Multiple R-squared:  0.2174, Adjusted R-squared:  0.2015 
## F-statistic: 13.61 on 1 and 49 DF,  p-value: 0.0005631
summary(sat_mod1) %>% coef() 
##                  Estimate   Std. Error   t value     Pr(>|t|)
## (Intercept) 1060.73244395 32.700896736 32.437412 8.878411e-35
## expense       -0.02227565  0.006037115 -3.689783 5.630902e-04
lm(csat ~ 1 + expense + percent, data = states_data) %>% 
  summary() 
## 
## Call:
## lm(formula = csat ~ 1 + expense + percent, data = states_data)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -62.921 -24.318   1.741  15.502  75.623 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 989.807403  18.395770  53.806  < 2e-16 ***
## expense       0.008604   0.004204   2.046   0.0462 *  
## percent      -2.537700   0.224912 -11.283 4.21e-15 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 31.62 on 48 degrees of freedom
## Multiple R-squared:  0.7857, Adjusted R-squared:  0.7768 
## F-statistic: 88.01 on 2 and 48 DF,  p-value: < 2.2e-16
sat_mod2<-lm(csat ~ 1 + expense + percent, data = states_data)
summary(sat_mod2)
## 
## Call:
## lm(formula = csat ~ 1 + expense + percent, data = states_data)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -62.921 -24.318   1.741  15.502  75.623 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 989.807403  18.395770  53.806  < 2e-16 ***
## expense       0.008604   0.004204   2.046   0.0462 *  
## percent      -2.537700   0.224912 -11.283 4.21e-15 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 31.62 on 48 degrees of freedom
## Multiple R-squared:  0.7857, Adjusted R-squared:  0.7768 
## F-statistic: 88.01 on 2 and 48 DF,  p-value: < 2.2e-16
sat_mod3<- lm(csat ~ 1 + expense + house + senate,
                     data = na.omit(states_data))

summary(sat_mod3) %>% coef()
##                  Estimate   Std. Error    t value     Pr(>|t|)
## (Intercept) 1082.93438041 38.633812740 28.0307405 1.067795e-29
## expense       -0.01870832  0.009691494 -1.9303852 6.001998e-02
## house         -1.44243754  0.600478382 -2.4021473 2.058666e-02
## senate         0.49817861  0.513561356  0.9700469 3.373256e-01
# add the interaction to the model
sat_expense_by_percent <- lm(csat ~ 1 + expense + income + expense : income, data = states_data)

# same as above, but shorter syntax
sat_expense_by_percent <- lm(csat ~ 1 + expense * income, data = states_data) 

# show the regression coefficients table
summary(sat_expense_by_percent)
## 
## Call:
## lm(formula = csat ~ 1 + expense * income, data = states_data)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -133.725  -29.478   -5.475   26.267  134.190 
## 
## Coefficients:
##                  Estimate Std. Error t value Pr(>|t|)    
## (Intercept)     1.380e+03  1.721e+02   8.021 2.37e-10 ***
## expense        -6.384e-02  3.270e-02  -1.952   0.0569 .  
## income         -1.050e+01  4.991e+00  -2.103   0.0408 *  
## expense:income  1.385e-03  8.636e-04   1.603   0.1155    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 57.75 on 47 degrees of freedom
## Multiple R-squared:  0.3002, Adjusted R-squared:  0.2555 
## F-statistic:  6.72 on 3 and 47 DF,  p-value: 0.0007246
levels(states_data$region)
## [1] "West"    "N. East" "South"   "Midwest"
sat_region <- lm(csat ~ 1 + region, data = states_data) 
summary(sat_region)
## 
## Call:
## lm(formula = csat ~ 1 + region, data = states_data)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -145.083  -29.389   -3.778   35.192   85.000 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)     946.31      14.80  63.958  < 2e-16 ***
## regionN. East   -56.75      23.13  -2.453  0.01800 *  
## regionSouth     -16.31      19.92  -0.819  0.41719    
## regionMidwest    63.78      21.36   2.986  0.00451 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 53.35 on 46 degrees of freedom
##   (1 observation deleted due to missingness)
## Multiple R-squared:  0.3853, Adjusted R-squared:  0.3452 
## F-statistic:  9.61 on 3 and 46 DF,  p-value: 4.859e-05
states_data <- states_data %>%
  mutate(region = relevel(region, ref = "Midwest"))
mod_region <- lm(csat ~ 1 + region, data = states_data)
mod_region %>%
  emmeans(specs = pairwise ~ region)
## $emmeans
##  region  emmean   SE df lower.CL upper.CL
##  Midwest   1010 15.4 46      979     1041
##  West       946 14.8 46      917      976
##  N. East    890 17.8 46      854      925
##  South      930 13.3 46      903      957
## 
## Confidence level used: 0.95 
## 
## $contrasts
##  contrast          estimate   SE df t.ratio p.value
##  Midwest - West        63.8 21.4 46  2.986  0.0226 
##  Midwest - N. East    120.5 23.5 46  5.124  <.0001 
##  Midwest - South       80.1 20.4 46  3.931  0.0016 
##  West - N. East        56.8 23.1 46  2.453  0.0812 
##  West - South          16.3 19.9 46  0.819  0.8453 
##  N. East - South      -40.4 22.2 46 -1.820  0.2774 
## 
## P value adjustment: tukey method for comparing a family of 4 estimates
e1<-mod_region %>%
  emmeans(specs = pairwise ~ region)
plot(e1)

emmip(sat_expense_by_percent, expense ~ income, cov.reduce = range)

emmip(sat_expense_by_percent, income ~ expense, cov.reduce = range)

# print summary coefficients table
summary(mod_region) 
## 
## Call:
## lm(formula = csat ~ 1 + region, data = states_data)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -145.083  -29.389   -3.778   35.192   85.000 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)    1010.08      15.40  65.590  < 2e-16 ***
## regionWest      -63.78      21.36  -2.986 0.004514 ** 
## regionN. East  -120.53      23.52  -5.124  5.8e-06 ***
## regionSouth     -80.08      20.37  -3.931 0.000283 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 53.35 on 46 degrees of freedom
##   (1 observation deleted due to missingness)
## Multiple R-squared:  0.3853, Adjusted R-squared:  0.3452 
## F-statistic:  9.61 on 3 and 46 DF,  p-value: 4.859e-05
#install.packages("devtools")
#devtools::install_github("neuropsychology/report")
##install.packages("flextable")
library(modelsummary)
## Warning: package 'modelsummary' was built under R version 4.0.3
modelsummary(sat_mod1, "table.docx")
models <- list(
  sat_mod1,
  sat_mod2 ,
  sat_mod3    
)
#modelsummary(models,"table3.docx",stars = TRUE, stars_note = TRUE)
modelsummary(models,stars = TRUE, stars_note = TRUE)
Model 1 Model 2 Model 3
(Intercept) 1060.732*** 989.807*** 1082.934***
(32.701) (18.396) (38.634)
expense -0.022*** 0.009** -0.019*
(0.006) (0.004) (0.010)
percent -2.538***
(0.225)
house -1.442**
(0.600)
senate 0.498
(0.514)
Num.Obs. 51 51 48
R2 0.217 0.786 0.283
R2 Adj. 0.201 0.777 0.234
AIC 566.0 501.9 532.3
BIC 571.8 509.7 541.6
Log.Lik. -279.999 -246.967 -261.127
F 13.615 88.009 5.780
* p < 0.1, ** p < 0.05, *** p < 0.01