regression Q1.Q2

Q1

pacman::p_load(HistData)
library(HistData)
example(Galton)

## 
## Galton> ## Don't show: 
## Galton>  # allow to work with car 1
## Galton>  if (packageDescription("car")[["Version"]] < 2) {
## Galton+      dataEllipse <- data.ellipse
## Galton+  }
## 
## Galton> ## End(Don't show)
## Galton> data(Galton)
## 
## Galton> ###########################################################################
## Galton> # sunflower plot with regression line and data ellipses and lowess smooth
## Galton> ###########################################################################
## Galton> 
## Galton> with(Galton, 
## Galton+  {
## Galton+  sunflowerplot(parent,child, xlim=c(62,74), ylim=c(62,74))
## Galton+  reg <- lm(child ~ parent)
## Galton+  abline(reg)
## Galton+  lines(lowess(parent, child), col="blue", lwd=2)
## Galton+  if(require(car)) {
## Galton+  dataEllipse(parent,child, xlim=c(62,74), ylim=c(62,74), plot.points=FALSE)
## Galton+      }
## Galton+   })

## Loading required package: car

m0 <- lm(child ~ 1, data = Galton); m0

## 
## Call:
## lm(formula = child ~ 1, data = Galton)
## 
## Coefficients:
## (Intercept)  
##       68.09

m1 <- update(m0, . ~ . + parent)
summary(m1)

## 
## Call:
## lm(formula = child ~ parent, data = Galton)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -7.8050 -1.3661  0.0487  1.6339  5.9264 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 23.94153    2.81088   8.517   <2e-16 ***
## parent       0.64629    0.04114  15.711   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2.239 on 926 degrees of freedom
## Multiple R-squared:  0.2105, Adjusted R-squared:  0.2096 
## F-statistic: 246.8 on 1 and 926 DF,  p-value: < 2.2e-16

anova(m0, m1)

## Analysis of Variance Table
## 
## Model 1: child ~ 1
## Model 2: child ~ parent
##   Res.Df    RSS Df Sum of Sq      F    Pr(>F)    
## 1    927 5877.2                                  
## 2    926 4640.3  1    1236.9 246.84 < 2.2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

pacman::p_load(alr4)
library(alr4)
?Heights

## starting httpd help server ...

##  done

# Heights in inches of mothers and their daughters
ma <- lm(dheight ~ mheight, data = Heights) ; ma

## 
## Call:
## lm(formula = dheight ~ mheight, data = Heights)
## 
## Coefficients:
## (Intercept)      mheight  
##     29.9174       0.5417

summary(ma)

## 
## Call:
## lm(formula = dheight ~ mheight, data = Heights)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -7.397 -1.529  0.036  1.492  9.053 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 29.91744    1.62247   18.44   <2e-16 ***
## mheight      0.54175    0.02596   20.87   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2.266 on 1373 degrees of freedom
## Multiple R-squared:  0.2408, Adjusted R-squared:  0.2402 
## F-statistic: 435.5 on 1 and 1373 DF,  p-value: < 2.2e-16

require(ggplot2)

## Loading required package: ggplot2

ggplot(ma, aes(sample = scale(resid(ma, "pearson")))) +
  stat_qq() +
  geom_abline(intercept = 0, slope = 1)+
  labs(x = "Normal quantiles",
       y = "Quantiles of standardized residuals")

Q2

#
# example of regression with sampling weights
#

# set some options
options(digits = 3, show.signif.stars = FALSE)

# set path to working directory
setwd("C:\\Users\\USER\\Desktop\\多層次\\w3 exercise")


# load packages
pacman::p_load(tidyverse)

# seed the random number generator to get the same sample
set.seed(6102)

require(dplyr)

# pick 72 countries from three regions 
# arrange the rows by alphabetical order
dta <- UN11 %>%
  filter(region %in% c("Africa", "Asia", "Europe")) %>%
  sample_n(72) %>%
  arrange(region)

# first 6 lines of data frame
head(dta)

##   region  group fertility ppgdp lifeExpF pctUrban
## 1 Africa africa      2.64  2654     75.5       44
## 2 Africa africa      4.74   737     63.2       28
## 3 Africa africa      2.38  7255     54.1       62
## 4 Africa africa      4.36  1131     61.0       42
## 5 Africa africa      4.62   802     59.2       23
## 6 Africa africa      4.98 16852     52.9       40

# data dimensions - rows and columns
dim(dta)

## [1] 72  6

# how many countries in each of the three regions
R3 <- table(dta$region)

# percentage of countries from each of the three regions selected
w <- R3/table(UN11$region)

# add the sampling weights variable to data
# skip over countries in regions not selected
dta$wt <- rep(1/w[w != 0], R3[R3 != 0])

# simple regression
m0 <- lm(fertility ~ log(ppgdp), data = dta)

# weighted regression
m1 <- update(m0, weights = wt)

# plot
ggplot(dta, aes(log(ppgdp), fertility, group = region, label = region)) +
  geom_text(check_overlap = TRUE, size = rel(2), col = "gray30")+
  stat_smooth(aes(group = 1), method = "lm", se = F, col = "gray30") +
  stat_smooth(method = "lm", se = F)+
  geom_abline(intercept = coef(m1)[1], slope = coef(m1)[2], col = "firebrick") +
  labs(x = "GDP (US$ in log units)", y = "Number of children per woman") +
  theme_bw()

regression Q1.Q2

Li

2017年10月22日

Q1

Q2