setwd("D:/Rmodels")
library(tidyverse)
## -- Attaching packages -------------------------------------------------------------------------------- tidyverse 1.3.0 --
## √ ggplot2 3.3.2     √ purrr   0.3.4
## √ tibble  3.0.3     √ dplyr   1.0.2
## √ tidyr   1.1.2     √ stringr 1.4.0
## √ readr   1.3.1     √ forcats 0.5.0
## -- Conflicts ----------------------------------------------------------------------------------- tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
library(rmarkdown)
library(lme4)
## Loading required package: Matrix
## 
## Attaching package: 'Matrix'
## The following objects are masked from 'package:tidyr':
## 
##     expand, pack, unpack
library(emmeans)
## Warning: package 'emmeans' was built under R version 4.0.3
library(effects)
## Warning: package 'effects' was built under R version 4.0.3
## Loading required package: carData
## lattice theme set by effectsTheme()
## See ?effectsTheme for details.
library(ggeffects)
## Warning: package 'ggeffects' was built under R version 4.0.3
states_data <- read_rds("dataSets/states.rds")
#View(states_data)
tail(states_data)
##            state  region     pop  area density metro waste energy miles toxic
## 46       Vermont N. East  563000  9249   60.87  23.4  0.69    232  10.4  1.81
## 47      Virginia   South 6187000 39598  156.25  72.5  1.45    306   9.7 12.87
## 48    Washington    West 4867000 66582   73.10  81.7  1.05    389   9.2  8.51
## 49 West Virginia   South 1793000 24087   74.44  36.4  0.95    415   8.6 21.30
## 50     Wisconsin Midwest 4892000 54314   90.07  67.4  0.70    288   9.1  9.20
## 51       Wyoming    West  454000 97105    4.68  29.6  0.70    786  12.8 25.51
##     green house senate csat vsat msat percent expense income high college
## 46  15.17    85     94  890  424  466      68    6738 34.717 80.8    24.3
## 47  18.72    33     54  890  424  466      60    4836 38.838 75.2    24.5
## 48  16.51    52     64  913  433  480      49    5000 36.338 83.8    22.9
## 49  51.14    48     57  926  441  485      17    4911 24.233 66.0    12.3
## 50  20.58    47     57 1023  481  542      11    5871 34.309 78.6    17.7
## 51 114.40     0     10  980  466  514      13    5723 31.576 83.0    18.8
head(states_data)
##        state region      pop   area density metro waste energy miles toxic
## 1    Alabama  South  4041000  52423   77.08  67.4  1.11    393  10.5 27.86
## 2     Alaska   West   550000 570374    0.96  41.1  0.91    991   7.2 37.41
## 3    Arizona   West  3665000 113642   32.25  79.0  0.79    258   9.7 19.65
## 4   Arkansas  South  2351000  52075   45.15  40.1  0.85    330   8.9 24.60
## 5 California   West 29760000 155973  190.80  95.7  1.51    246   8.7  3.26
## 6   Colorado   West  3294000 103730   31.76  81.5  0.73    273   8.3  2.25
##   green house senate csat vsat msat percent expense income high college
## 1 29.25    30     10  991  476  515       8    3627 27.498 66.9    15.7
## 2    NA     0     20  920  439  481      41    8330 48.254 86.6    23.0
## 3 18.37    13     33  932  442  490      26    4309 32.093 78.7    20.3
## 4 26.04    25     37 1005  482  523       6    3700 24.643 66.3    13.3
## 5 15.65    50     47  897  415  482      47    4491 41.716 76.2    23.4
## 6 21.89    36     58  959  453  506      29    5064 35.123 84.4    27.0
sts_ex_sat <- 
  states_data %>% 
  select(expense, csat)
cor(sts_ex_sat, use = "pairwise") 
##            expense       csat
## expense  1.0000000 -0.4662978
## csat    -0.4662978  1.0000000
qplot(x = expense, y = csat, geom = "point", data = sts_ex_sat)

sat_mod1 <- lm(csat ~ 1 + expense, # regression formula
              data = states_data) # data 
lm(csat ~ 1 + expense, # regression formula
   data = states_data) # data 
## 
## Call:
## lm(formula = csat ~ 1 + expense, data = states_data)
## 
## Coefficients:
## (Intercept)      expense  
##  1060.73244     -0.02228
sat_mod1
## 
## Call:
## lm(formula = csat ~ 1 + expense, data = states_data)
## 
## Coefficients:
## (Intercept)      expense  
##  1060.73244     -0.02228
summary(sat_mod1)
## 
## Call:
## lm(formula = csat ~ 1 + expense, data = states_data)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -131.811  -38.085    5.607   37.852  136.495 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  1.061e+03  3.270e+01   32.44  < 2e-16 ***
## expense     -2.228e-02  6.037e-03   -3.69 0.000563 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 59.81 on 49 degrees of freedom
## Multiple R-squared:  0.2174, Adjusted R-squared:  0.2015 
## F-statistic: 13.61 on 1 and 49 DF,  p-value: 0.0005631
summary(sat_mod1) %>% coef() 
##                  Estimate   Std. Error   t value     Pr(>|t|)
## (Intercept) 1060.73244395 32.700896736 32.437412 8.878411e-35
## expense       -0.02227565  0.006037115 -3.689783 5.630902e-04
lm(csat ~ 1 + expense + percent, data = states_data) %>% 
  summary() 
## 
## Call:
## lm(formula = csat ~ 1 + expense + percent, data = states_data)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -62.921 -24.318   1.741  15.502  75.623 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 989.807403  18.395770  53.806  < 2e-16 ***
## expense       0.008604   0.004204   2.046   0.0462 *  
## percent      -2.537700   0.224912 -11.283 4.21e-15 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 31.62 on 48 degrees of freedom
## Multiple R-squared:  0.7857, Adjusted R-squared:  0.7768 
## F-statistic: 88.01 on 2 and 48 DF,  p-value: < 2.2e-16
sat_mod2<-lm(csat ~ 1 + expense + percent, data = states_data)
summary(sat_mod2)
## 
## Call:
## lm(formula = csat ~ 1 + expense + percent, data = states_data)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -62.921 -24.318   1.741  15.502  75.623 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 989.807403  18.395770  53.806  < 2e-16 ***
## expense       0.008604   0.004204   2.046   0.0462 *  
## percent      -2.537700   0.224912 -11.283 4.21e-15 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 31.62 on 48 degrees of freedom
## Multiple R-squared:  0.7857, Adjusted R-squared:  0.7768 
## F-statistic: 88.01 on 2 and 48 DF,  p-value: < 2.2e-16
sat_mod3<- lm(csat ~ 1 + expense + house + senate,
                     data = na.omit(states_data))

summary(sat_mod3) %>% coef()
##                  Estimate   Std. Error    t value     Pr(>|t|)
## (Intercept) 1082.93438041 38.633812740 28.0307405 1.067795e-29
## expense       -0.01870832  0.009691494 -1.9303852 6.001998e-02
## house         -1.44243754  0.600478382 -2.4021473 2.058666e-02
## senate         0.49817861  0.513561356  0.9700469 3.373256e-01
# add the interaction to the model
sat_expense_by_percent <- lm(csat ~ 1 + expense + income +metro+ expense : income, data = states_data)
ggpredict(sat_expense_by_percent, terms = c("expense", "income"))
## 
## # Predicted values of csat
## # x = expense
## 
## # income = 27.44
## 
##    x | Predicted |    SE |            95% CI
## --------------------------------------------
## 2950 |   1002.63 | 23.98 | [955.62, 1049.63]
## 3900 |    982.74 | 15.56 | [952.23, 1013.24]
## 4850 |    962.84 | 14.70 | [934.03,  991.66]
## 5800 |    942.95 | 22.29 | [899.27,  986.64]
## 6750 |    923.06 | 32.93 | [858.52,  987.60]
## 8650 |    883.27 | 56.39 | [772.75,  993.79]
## 
## # income = 33.92
## 
##    x | Predicted |    SE |            95% CI
## --------------------------------------------
## 2950 |    970.23 | 22.40 | [926.33, 1014.13]
## 3900 |    957.07 | 14.46 | [928.73,  985.41]
## 4850 |    943.90 |  9.74 | [924.81,  962.99]
## 5800 |    930.74 | 12.61 | [906.02,  955.46]
## 6750 |    917.58 | 20.04 | [878.29,  956.86]
## 8650 |    891.25 | 37.72 | [817.33,  965.17]
## 
## # income = 40.4
## 
##    x | Predicted |    SE |            95% CI
## --------------------------------------------
## 2950 |    937.83 | 35.67 | [867.91, 1007.76]
## 3900 |    931.40 | 27.35 | [877.79,  985.01]
## 4850 |    924.97 | 19.91 | [885.94,  963.99]
## 5800 |    918.53 | 14.75 | [889.63,  947.43]
## 6750 |    912.10 | 14.53 | [883.61,  940.58]
## 8650 |    899.23 | 26.78 | [846.73,  951.72]
## 
## Adjusted for:
## * metro = 64.07
mydf<-ggpredict(sat_expense_by_percent, terms = c("expense", "income"))
ggplot(mydf, aes(x = x, y = predicted, colour = group)) +
  stat_smooth(method = "lm", se = FALSE)
## `geom_smooth()` using formula 'y ~ x'

mydf<-ggpredict(sat_expense_by_percent, terms = c("expense", "income","metro"))
ggplot(mydf, aes(x = x, y = predicted, colour = group)) +
  stat_smooth(method = "lm", se = FALSE) +
  facet_wrap(~facet, ncol = 2)
## `geom_smooth()` using formula 'y ~ x'

# same as above, but shorter syntax
sat_expense_by_percent <- lm(csat ~ 1 + expense * income, data = states_data) 

# show the regression coefficients table
summary(sat_expense_by_percent)
## 
## Call:
## lm(formula = csat ~ 1 + expense * income, data = states_data)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -133.725  -29.478   -5.475   26.267  134.190 
## 
## Coefficients:
##                  Estimate Std. Error t value Pr(>|t|)    
## (Intercept)     1.380e+03  1.721e+02   8.021 2.37e-10 ***
## expense        -6.384e-02  3.270e-02  -1.952   0.0569 .  
## income         -1.050e+01  4.991e+00  -2.103   0.0408 *  
## expense:income  1.385e-03  8.636e-04   1.603   0.1155    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 57.75 on 47 degrees of freedom
## Multiple R-squared:  0.3002, Adjusted R-squared:  0.2555 
## F-statistic:  6.72 on 3 and 47 DF,  p-value: 0.0007246
levels(states_data$region)
## [1] "West"    "N. East" "South"   "Midwest"
sat_region <- lm(csat ~ 1 + region, data = states_data) 
summary(sat_region)
## 
## Call:
## lm(formula = csat ~ 1 + region, data = states_data)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -145.083  -29.389   -3.778   35.192   85.000 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)     946.31      14.80  63.958  < 2e-16 ***
## regionN. East   -56.75      23.13  -2.453  0.01800 *  
## regionSouth     -16.31      19.92  -0.819  0.41719    
## regionMidwest    63.78      21.36   2.986  0.00451 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 53.35 on 46 degrees of freedom
##   (1 observation deleted due to missingness)
## Multiple R-squared:  0.3853, Adjusted R-squared:  0.3452 
## F-statistic:  9.61 on 3 and 46 DF,  p-value: 4.859e-05
states_data <- states_data %>%
  mutate(region = relevel(region, ref = "Midwest"))
mod_region <- lm(csat ~ 1 + region, data = states_data)
mod_region %>%
  emmeans(specs = pairwise ~ region)
## $emmeans
##  region  emmean   SE df lower.CL upper.CL
##  Midwest   1010 15.4 46      979     1041
##  West       946 14.8 46      917      976
##  N. East    890 17.8 46      854      925
##  South      930 13.3 46      903      957
## 
## Confidence level used: 0.95 
## 
## $contrasts
##  contrast          estimate   SE df t.ratio p.value
##  Midwest - West        63.8 21.4 46  2.986  0.0226 
##  Midwest - N. East    120.5 23.5 46  5.124  <.0001 
##  Midwest - South       80.1 20.4 46  3.931  0.0016 
##  West - N. East        56.8 23.1 46  2.453  0.0812 
##  West - South          16.3 19.9 46  0.819  0.8453 
##  N. East - South      -40.4 22.2 46 -1.820  0.2774 
## 
## P value adjustment: tukey method for comparing a family of 4 estimates
e1<-mod_region %>%
  emmeans(specs = pairwise ~ region)
plot(e1)

emmip(sat_expense_by_percent, expense ~ income, cov.reduce = range)

emmip(sat_expense_by_percent, income ~ expense, cov.reduce = range)

# print summary coefficients table
summary(mod_region) 
## 
## Call:
## lm(formula = csat ~ 1 + region, data = states_data)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -145.083  -29.389   -3.778   35.192   85.000 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)    1010.08      15.40  65.590  < 2e-16 ***
## regionWest      -63.78      21.36  -2.986 0.004514 ** 
## regionN. East  -120.53      23.52  -5.124  5.8e-06 ***
## regionSouth     -80.08      20.37  -3.931 0.000283 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 53.35 on 46 degrees of freedom
##   (1 observation deleted due to missingness)
## Multiple R-squared:  0.3853, Adjusted R-squared:  0.3452 
## F-statistic:  9.61 on 3 and 46 DF,  p-value: 4.859e-05
#install.packages("devtools")
#devtools::install_github("neuropsychology/report")
#install.packages("flextable")
library(modelsummary)
## Warning: package 'modelsummary' was built under R version 4.0.3
modelsummary(sat_mod1, "table.docx")
models <- list(
  sat_mod1,
  sat_mod2 ,
  sat_mod3    
)
#modelsummary(models,"table3.docx",stars = TRUE, stars_note = TRUE)
modelsummary(models,stars = TRUE, stars_note = TRUE)
Model 1 Model 2 Model 3
(Intercept) 1060.732*** 989.807*** 1082.934***
(32.701) (18.396) (38.634)
expense -0.022*** 0.009** -0.019*
(0.006) (0.004) (0.010)
percent -2.538***
(0.225)
house -1.442**
(0.600)
senate 0.498
(0.514)
Num.Obs. 51 51 48
R2 0.217 0.786 0.283
R2 Adj. 0.201 0.777 0.234
AIC 566.0 501.9 532.3
BIC 571.8 509.7 541.6
Log.Lik. -279.999 -246.967 -261.127
F 13.615 88.009 5.780
* p < 0.1, ** p < 0.05, *** p < 0.01
data(efc)
fit <- lm(barthtot ~ c12hour + neg_c_7 + c161sex + c172code, data = efc)
mydf <- ggpredict(fit, terms = c("c12hour", "c161sex", "c172code"))
ggplot(mydf, aes(x = x, y = predicted, colour = group)) +
  stat_smooth(method = "lm", se = FALSE) +
  facet_wrap(~facet, ncol = 2)
## `geom_smooth()` using formula 'y ~ x'

NH11 <- read_rds("dataSets/NatHealth2011.rds")
levels(NH11$hypev)
## [1] "1 Yes"             "2 No"              "7 Refused"        
## [4] "8 Not ascertained" "9 Don't know"
NH11 <- NH11 %>%
  mutate(hypev = factor(hypev, levels=c("2 No", "1 Yes")))
hyp_out <- glm(hypev ~ 1 + age_p + sex + sleep + bmi,
               data = NH11, 
               family = binomial(link = "logit"))
glm(hypev ~ 1 + age_p + sex + sleep + bmi,
    data = NH11, 
    family = binomial(link = "logit"))
## 
## Call:  glm(formula = hypev ~ 1 + age_p + sex + sleep + bmi, family = binomial(link = "logit"), 
##     data = NH11)
## 
## Coefficients:
## (Intercept)        age_p  sex2 Female        sleep          bmi  
##   -4.269466     0.060699    -0.144025    -0.007036     0.018572  
## 
## Degrees of Freedom: 32967 Total (i.e. Null);  32963 Residual
##   (46 observations deleted due to missingness)
## Null Deviance:       41520 
## Residual Deviance: 34240     AIC: 34250
summary(hyp_out) %>% coef()
##                 Estimate   Std. Error    z value     Pr(>|z|)
## (Intercept) -4.269466028 0.0564947294 -75.572820 0.000000e+00
## age_p        0.060699303 0.0008227207  73.778743 0.000000e+00
## sex2 Female -0.144025092 0.0267976605  -5.374540 7.677854e-08
## sleep       -0.007035776 0.0016397197  -4.290841 1.779981e-05
## bmi          0.018571704 0.0009510828  19.526906 6.485172e-85
summary(hyp_out)
## 
## Call:
## glm(formula = hypev ~ 1 + age_p + sex + sleep + bmi, family = binomial(link = "logit"), 
##     data = NH11)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.3507  -0.7869  -0.4801   0.9421   2.5987  
## 
## Coefficients:
##               Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -4.2694660  0.0564947 -75.573  < 2e-16 ***
## age_p        0.0606993  0.0008227  73.779  < 2e-16 ***
## sex2 Female -0.1440251  0.0267977  -5.375 7.68e-08 ***
## sleep       -0.0070358  0.0016397  -4.291 1.78e-05 ***
## bmi          0.0185717  0.0009511  19.527  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 41515  on 32967  degrees of freedom
## Residual deviance: 34235  on 32963  degrees of freedom
##   (46 observations deleted due to missingness)
## AIC: 34245
## 
## Number of Fisher Scoring iterations: 4
coef(hyp_out) %>% exp() #OR 发生比
## (Intercept)       age_p sex2 Female       sleep         bmi 
##  0.01398925  1.06257935  0.86586602  0.99298892  1.01874523
hyp_out %>% 
  allEffects() %>%
  plot(type = "response") # "response" refers to the probability scale /effects 

hyp_out1 <- glm(hypev ~ 1 + age_p + sex + sleep + bmi,
                data = NH11, 
                family = binomial(link = "logit"))
hyp_out2 <- glm(hypev ~ 1 + age_p + sex + sleep ,
                data = NH11, 
                family = binomial(link = "logit"))
models1 <- list(
  hyp_out2,
  hyp_out1   
)
modelsummary(models1,stars = TRUE, stars_note = TRUE)
Model 1 Model 2
(Intercept) -3.734*** -4.269***
(0.048) (0.056)
age_p 0.060*** 0.061***
(0.001) (0.001)
sex2 Female -0.100*** -0.144***
(0.027) (0.027)
sleep 0.000 -0.007***
(0.002) (0.002)
bmi 0.019***
(0.001)
Num.Obs. 32968 32968
AIC 34628.8 34245.4
BIC 34662.4 34287.4
Log.Lik. -17310.393 -17117.676
* p < 0.1, ** p < 0.05, *** p < 0.01
modelsummary(models1,stars = TRUE, stars_note = TRUE,exponentiate = TRUE)#change to OR
Model 1 Model 2
(Intercept) 0.024*** 0.014***
(0.048) (0.056)
age_p 1.062*** 1.063***
(0.001) (0.001)
sex2 Female 0.905*** 0.866***
(0.027) (0.027)
sleep 1.000 0.993***
(0.002) (0.002)
bmi 1.019***
(0.001)
Num.Obs. 32968 32968
AIC 34628.8 34245.4
BIC 34662.4 34287.4
Log.Lik. -17310.393 -17117.676
* p < 0.1, ** p < 0.05, *** p < 0.01