20161024.r

#1#
data(teengamb,package = "faraway")


my_data <- teengamb
head(my_data)

##   sex status income verbal gamble
## 1   1     51   2.00      8    0.0
## 2   1     28   2.50      8    0.0
## 3   1     37   2.00      6    0.0
## 4   1     28   7.00      4    7.3
## 5   1     65   2.00      8   19.6
## 6   1     61   3.47      6    0.1

l_model <- lm(gamble~ sex+ status +income +verbal , my_data)

summary(l_model)

## 
## Call:
## lm(formula = gamble ~ sex + status + income + verbal, data = my_data)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -51.082 -11.320  -1.451   9.452  94.252 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  22.55565   17.19680   1.312   0.1968    
## sex         -22.11833    8.21111  -2.694   0.0101 *  
## status        0.05223    0.28111   0.186   0.8535    
## income        4.96198    1.02539   4.839 1.79e-05 ***
## verbal       -2.95949    2.17215  -1.362   0.1803    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 22.69 on 42 degrees of freedom
## Multiple R-squared:  0.5267, Adjusted R-squared:  0.4816 
## F-statistic: 11.69 on 4 and 42 DF,  p-value: 1.815e-06

#which variables are statistically significant at the 5% level 
data.frame(summary(l_model)$coef[summary(l_model)$coef[,4]<=0.05,4])

##        summary.l_model..coef.summary.l_model..coef...4.....0.05..4.
## sex                                                    1.011184e-02
## income                                                 1.791882e-05

#2#
#what interpretation should be given to the coefficient for sex. 

#If all the predictors except sex are held constant, the difference in predicted expenditure on gambling between
#male (sex=0) and female (sex=1) will be equal to the regression coefficient of sex, i.e., −22.11833. Therefore
#whenever sex changes from male (sex=0) to female (sex=1), the value of gamble decreases by 22.11833. In
#other words, according to the current regression model, a female spends $22.11833 less than a comparable
#(i.e., other predictors being held constant) male on gambling.



#3#
#fit a model with just income, .......
l_model_two <- lm(gamble~income,my_data)
summary(l_model_two)

## 
## Call:
## lm(formula = gamble ~ income, data = my_data)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -46.020 -11.874  -3.757  11.934 107.120 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   -6.325      6.030  -1.049      0.3    
## income         5.520      1.036   5.330 3.05e-06 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 24.95 on 45 degrees of freedom
## Multiple R-squared:  0.387,  Adjusted R-squared:  0.3734 
## F-statistic: 28.41 on 1 and 45 DF,  p-value: 3.045e-06

#use F test to compare. 

#Two linear models are Nested if one (the restricted model) is
#obtained from the other (the full model) by setting some
#parameters to zero (i.e. removing terms from the model), or
#some other constraint on the parameters.
#We can compare nested models fit to the same dataset with the
#F test.
anova(l_model_two,l_model)

## Analysis of Variance Table
## 
## Model 1: gamble ~ income
## Model 2: gamble ~ sex + status + income + verbal
##   Res.Df   RSS Df Sum of Sq      F  Pr(>F)  
## 1     45 28009                              
## 2     42 21624  3    6384.8 4.1338 0.01177 *
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

#null hypo is Beta_sex = Beta_status = Beta_verbal= 0 
# we can see the P-value is 0.01177, 
#it indicate that null can be rejected here and the proposed
#simplification of model is NOT justifiable. 





#.2.3

###(a)####
x <- c(6,6.3,6.5,6.8,7.0,7.1,7.5,7.5,7.6,7.8,8.0,8.2,8.4,8.4,8.9)

x

##  [1] 6.0 6.3 6.5 6.8 7.0 7.1 7.5 7.5 7.6 7.8 8.0 8.2 8.4 8.4 8.9

y <- c(39,58,49,53,80,86,115,124,104,131,147,160,156,172,180)

y

##  [1]  39  58  49  53  80  86 115 124 104 131 147 160 156 172 180

#l_model <- lm(y~x)
#summary(l_model)
plot(x, y, pch = 16, cex = 1.3, col = "blue", main = "(a) linear", xlab = "X", ylab = "Y")
abline(lm(y~x),col="red")

###(b)###
m_parabola <- lm(y ~ poly(x,2))
m_linear <- lm(y~x)
summary(m_parabola)

## 
## Call:
## lm(formula = y ~ poly(x, 2))
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -20.209  -7.213   3.230   8.746  12.921 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  110.267      2.836  38.884 5.40e-14 ***
## poly(x, 2)1  173.646     10.983  15.811 2.13e-09 ***
## poly(x, 2)2    4.070     10.983   0.371    0.717    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 10.98 on 12 degrees of freedom
## Multiple R-squared:  0.9542, Adjusted R-squared:  0.9466 
## F-statistic: 125.1 on 2 and 12 DF,  p-value: 9.208e-09

summary(m_linear)

## 
## Call:
## lm(formula = y ~ x)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -20.647  -6.851   2.423   9.374  11.902 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -299.877     25.215  -11.89 2.33e-08 ***
## x             54.930      3.357   16.36 4.70e-10 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 10.61 on 13 degrees of freedom
## Multiple R-squared:  0.9537, Adjusted R-squared:  0.9501 
## F-statistic: 267.7 on 1 and 13 DF,  p-value: 4.701e-10

#fitted parabola 
xx <- seq(from=6,to=9,by=0.05)
yy <- 110.267+173.646*xx +4.070*(xx^2)


plot(x, y, pch = 16, cex = 1.3, col = "blue", main = "(b) non linear", xlab = "X", ylab = "Y")

lines(x, predict(m_parabola), col = "red")

###2.14####

#input the points 

#(a)
x <- c(0,0,0,0,20,20,20,20,40,40,40,40,60,60,60,60,80,80,80,80,100,100,100,100)

y <- c(0.727,0.721,0.742,0.746,0.884,0.880,0.885,0.890,1.073,1.050,1.045,1.033,1.194,
       1.184,1.205,1.180,1.350,1.291,1.291,1.323,1.442,1.369,1.458,1.459)
x

##  [1]   0   0   0   0  20  20  20  20  40  40  40  40  60  60  60  60  80
## [18]  80  80  80 100 100 100 100

##  [1] 0.727 0.721 0.742 0.746 0.884 0.880 0.885 0.890 1.073 1.050 1.045
## [12] 1.033 1.194 1.184 1.205 1.180 1.350 1.291 1.291 1.323 1.442 1.369
## [23] 1.458 1.459

plot(x, y, pch = "*", cex = 1.3, col = "black", main = "#2.14(a)# linear", xlab = "X", ylab = "Y")

matrix_y <- matrix(y,nrow=4)
matrix_y

##       [,1]  [,2]  [,3]  [,4]  [,5]  [,6]
## [1,] 0.727 0.884 1.073 1.194 1.350 1.442
## [2,] 0.721 0.880 1.050 1.184 1.291 1.369
## [3,] 0.742 0.885 1.045 1.205 1.291 1.458
## [4,] 0.746 0.890 1.033 1.180 1.323 1.459

y_model <- apply(X = matrix_y,MARGIN = 2,FUN=mean)
x_model <- seq(0,100,20)

y_model

## [1] 0.73400 0.88475 1.05025 1.19075 1.31375 1.43200

x_model

## [1]   0  20  40  60  80 100

points(x_model,y_model,col="red",pch="o")

lin_model <- lm(y_model~x_model)
abline(lin_model,col='blue')

summary(lin_model)

## 
## Call:
## lm(formula = y_model ~ x_model)
## 
## Residuals:
##         1         2         3         4         5         6 
## -0.015667 -0.005417  0.019583  0.019583  0.002083 -0.020167 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 0.7496667  0.0137932   54.35 6.86e-07 ***
## x_model     0.0070250  0.0002278   30.84 6.59e-06 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.01906 on 4 degrees of freedom
## Multiple R-squared:  0.9958, Adjusted R-squared:  0.9948 
## F-statistic: 951.1 on 1 and 4 DF,  p-value: 6.586e-06

lin_model$coefficients

## (Intercept)     x_model 
##   0.7496667   0.0070250

###c###

parabola_model <- lm(y_model ~ poly(x_model,2))


plot(x, y, pch = "*", cex = 1.3, col = "black", main = "#2.14(c)# parabola", xlab = "X", ylab = "Y")
points(x_model,y_model,col="red",pch="o")

lines(x_model, predict(parabola_model), col = "blue")

summary(parabola_model)

## 
## Call:
## lm(formula = y_model ~ poly(x_model, 2))
## 
## Residuals:
##         1         2         3         4         5         6 
##  0.004125 -0.009375  0.003750  0.003750 -0.001875 -0.000375 
## 
## Coefficients:
##                    Estimate Std. Error t value Pr(>|t|)    
## (Intercept)        1.100917   0.002756 399.509 3.46e-08 ***
## poly(x_model, 2)1  0.587754   0.006750  87.075 3.34e-06 ***
## poly(x_model, 2)2 -0.036279   0.006750  -5.375   0.0126 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.00675 on 3 degrees of freedom
## Multiple R-squared:  0.9996, Adjusted R-squared:  0.9993 
## F-statistic:  3805 on 2 and 3 DF,  p-value: 7.821e-06

parabola_model$coefficients

##       (Intercept) poly(x_model, 2)1 poly(x_model, 2)2 
##        1.10091667        0.58775367       -0.03627872

###e###
#get x square standard error 

x_suqare_SE <- summary(parabola_model)$coefficients[,2][3]

x_square_coefficient <- summary(parabola_model)$coefficients[,1][3]

t_statistic <- (x_square_coefficient-0)/x_suqare_SE

2 * pt(abs(t_statistic), parabola_model$df.residual, lower.tail = FALSE)

## poly(x_model, 2)2 
##        0.01261207

20161024.r

Andy

Wed Oct 26 01:40:30 2016