STAT 305 HW1

  #install.packages('car')
  #install.packages('corrplot')
  #install.packages('MPV')
  
  library(car)

## Loading required package: carData

  library(corrplot)

## corrplot 0.92 loaded

  library(MPV)

## Loading required package: lattice

## Loading required package: KernSmooth

## KernSmooth 2.23 loaded
## Copyright M. P. Wand 1997-2009

## Loading required package: randomForest

## randomForest 4.7-1.1

## Type rfNews() to see new features/changes/bug fixes.

prestige <- data.frame(Prestige[,4],Prestige[,-c(4,6)])
  colnames(prestige)[1] <- 'prestige'
  
  plotlm <- function(lm.mod)
  {
    windows();
    quartz();
    par(mfrow=c(2,2))
    plot(lm.mod)
  }

Exercise 1 1.

  help(Prestige)

## starting httpd help server ... done

  pairs(prestige, panel = panel.smooth, main = 'Prestige Data Set', col=3)

  #The 'education' is the strongest predictors of the response since it is almost a straight line. There are some noise variables, such as 'income', 'women', and 'census'.

  corrplot(cor(prestige))

  #The darker the blue the higher the correlation, the darker the red the higher the negative correlation.

  lm.education <- lm(prestige~education, data = prestige)
  summary(lm.education)

## 
## Call:
## lm(formula = prestige ~ education, data = prestige)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -26.0397  -6.5228   0.6611   6.7430  18.1636 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  -10.732      3.677  -2.919  0.00434 ** 
## education      5.361      0.332  16.148  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 9.103 on 100 degrees of freedom
## Multiple R-squared:  0.7228, Adjusted R-squared:   0.72 
## F-statistic: 260.8 on 1 and 100 DF,  p-value: < 2.2e-16

  par(mfrow=c(2,2))
  plot(lm.education)

The number of the data in this dataset is 102. The M-education is significant and it needs an intercept. Residuals are randomly scattered around zero without any clear trends. Since in the Normal Q-Q plot, it is almost a straight line, the residuals are normal distribution. The residuals are independent. In the Residual vs Leverage chart, it is homoscedastic. So, the model is fit.

   library(ggplot2)

## 
## Attaching package: 'ggplot2'

## The following object is masked from 'package:randomForest':
## 
##     margin

   library(car)
  
   prestige <- data.frame(Prestige[,4], Prestige[,-c(4,6)])
   colnames(prestige)[1] <- 'prestige'
  

   lm.mod <- lm(prestige~income, data=prestige)
  
   xxnew <- data.frame(income=prestige$income)
   xnew <- prestige$income
   ynew <- prestige$prestige
  
   pred.band <- predict(lm.mod, xxnew, interval="prediction")
   conf.band <- predict(lm.mod, xxnew, interval="confidence")
  
   datadone <- data.frame(xnew,ynew, conf.band[,1], conf.band[,2], conf.band[,3], pred.band[,2], pred.band[,3])
   colnames(datadone) <- c('x', 'y', 'yhat', 'lc', 'uc', 'lp','up')
  
   p <- ggplot(datadone, aes(x, y)) +
    geom_point() +
    geom_line(color='black', aes(x=x, y=yhat))
  
   p + geom_line(aes(y = lc), color = "blue", linetype = "dotted")+
    geom_line(aes(y = uc), color = "blue", linetype = "dotted") +
    geom_line(aes(y = lp), color = "red", linetype = "dashed")+
    geom_line(aes(y = up), color = "red", linetype = "dashed")

   lm.mod <- lm(prestige~women, data=prestige)
  
   xxnew <- data.frame(women=prestige$women)
   xnew <- prestige$women
   ynew <- prestige$prestige
  
   pred.band <- predict(lm.mod, xxnew, interval="prediction")
   conf.band <- predict(lm.mod, xxnew, interval="confidence")
  
   datadone <- data.frame(xnew,ynew, conf.band[,1], conf.band[,2], conf.band[,3], pred.band[,2], pred.band[,3])
   colnames(datadone) <- c('x', 'y', 'yhat', 'lc', 'uc', 'lp','up')
  
   p <- ggplot(datadone, aes(x, y)) +
    geom_point() +
    geom_line(color='black', aes(x=x, y=yhat))
  
   p + geom_line(aes(y = lc), color = "blue", linetype = "dotted")+
    geom_line(aes(y = uc), color = "blue", linetype = "dotted") +
    geom_line(aes(y = lp), color = "red", linetype = "dashed")+
    geom_line(aes(y = up), color = "red", linetype = "dashed")

The confidence bands and the prediction bands of ‘income’ is smaller than the confidence bands and the prediction bands of ‘women’. But ‘income’ has some outliers and its range greater than the range of ‘women’.

  beta.0.hat <- coefficients(lm.education)[1]
  beta.1.hat <- coefficients(lm.education)[2]
  
  ese.beta_1_hat <- summary(lm.education)$coefficients[2,2]
  
  n <- nrow(prestige)
  p <- ncol(prestige)-1
  
  df <- n-2
  
  beta_1_0 <- 6
  
  t.beta_1_hat <- (beta.1.hat - beta_1_0)/ese.beta_1_hat
  
  alpha <- 0.1
  
  Pvalue <- pt(t.beta_1_hat, df=df)
  
  decision <- ifelse(Pvalue < alpha, 'Reject H_o', 'Do not reject H_o')
  
  cat('The Pvalue in this case is', Pvalue, '. Therefore', decision, '\n\n')

## The Pvalue in this case is 0.02852685 . Therefore Reject H_o

  decision

##    education 
## "Reject H_o"

  confint(lm.education, level = 0.9) #confidence interval

##                   5 %      95 %
## (Intercept) -16.83681 -4.627154
## education     4.80970  5.912056

  mu.hat.y.x <- predict(lm.education, newdata = data.frame(education=c(10)))
  mu.hat.y.x

##       1 
## 42.8768

  conf.mu.hat.y.x <- predict(lm.education, newdata = data.frame(education=c(10)), interval = "prediction")
  conf.mu.hat.y.x

##       fit     lwr      upr
## 1 42.8768 24.7213 61.03229

Exercise 2 1.

  library(MPV)
  help(cement)
  #The cement data frame has 13 rows and 5 columns. They are all numeric vectors. There is no strong correlation between explanatory variable(x) and response variable(y) in this data.

  pairs(cement, panel = panel.smooth, col=3)

  library(corrplot)
  corrplot(cor(cement))

x_1 and x_2 are positively correlated with y, and x_2 has a stronger correlation with y. x_3 and x_4 are negatively correlated with y, and x_4 has a stronger negative correlation.

  boxplot(cement)

x_1, x_3 have a small range, and x_1 is mostly concentrated in the middle of the range. x_2 mostly in the upper part of the range, and x_3, x_4 are mostly in the lower part of the range.

  summary(lm(y~., data=cement))

## 
## Call:
## lm(formula = y ~ ., data = cement)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -3.1750 -1.6709  0.2508  1.3783  3.9254 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)  
## (Intercept)  62.4054    70.0710   0.891   0.3991  
## x1            1.5511     0.7448   2.083   0.0708 .
## x2            0.5102     0.7238   0.705   0.5009  
## x3            0.1019     0.7547   0.135   0.8959  
## x4           -0.1441     0.7091  -0.203   0.8441  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2.446 on 8 degrees of freedom
## Multiple R-squared:  0.9824, Adjusted R-squared:  0.9736 
## F-statistic: 111.5 on 4 and 8 DF,  p-value: 4.756e-07

The number of the data is 10. x_1 is significant. x_2, x_3, x_4, and intercept are not significant. Since the p-value is too small, reject H_o.

  plotlm <- function(lm.mod)
  {
    op <- par(mfrow=c(2,2))
    plot(lm.mod)
    par(op)
  }

  plotlm(lm(y~., data=cement))

Residuals are randomly scattered around zero without any clear trends. Since in the Normal Q-Q plot, it is almost a straight line, the residuals are normal distribution. The residuals are independent. In the Residual vs Leverage chart, it is fanning in. So, the model is fit.

  shapiro.test(resid(lm(y~., data=cement)))

## 
##  Shapiro-Wilk normality test
## 
## data:  resid(lm(y ~ ., data = cement))
## W = 0.96967, p-value = 0.8903

H_0: the residuals are normally distribution H_1: the residuals are not normally distribution

w = 0.96967, it is close to 1, the data is a normal distribution. p-value = 0.8903, cannot reject H_o, we have sufficient evidence that the residuals are normally distribution.

Exercise 3 1. (a) #Since (x-x_bar)^2 is always positive, when (x-x_bar)^2 = 0 we can get the lowest leverage equals to 1/n. The lowest leverage can be obtained at the point (x_bar, y_bar).

#Sxx = (n-1)Sx^2, Sx is the standard deviation #x-x_bar = +-3Sx #leverage(x) = 1/n + (x* - x_bar)^2/Sxx = 1/n + (+-3Sx)^2/((n-1)Sx2) = 1/n + (9 Sx^2)/((n-1)Sx2) = 1/n + 9/(n-1) = 10/n
#Leverage is highest when (x-x_bar)^2 is highest. The highest leverage occurs when the x has the largest distance from x_bar.

#z score = (-2.33-0)/1 = -2.33
  p_sresidual_less_than_minus_2.33 <- pnorm(-2.33)
  p_sresidual_less_than_minus_2.33

## [1] 0.009903076

When hii is close to 1, the ̃ri is close to 0.

STAT 305 HW1

Shumei Liu

2023-09-21