This is an R Markdown document. Markdown is a simple formatting syntax for authoring HTML, PDF, and MS Word documents. For more details on using R Markdown see <http://rmarkdown.rstudio.com>.

When you click the **Knit** button, a document will be generated that includes both the content and the output of any embedded R code chunks within the document. You can embed an R code chunk like this:

Read file

# Switch to the folder holding the data, then load the population table.
# The CSV has a header row and comma-separated fields.
setwd("C:/Users/Yuan/Desktop/R/5")
USPopulation <- read.csv("USPOPULATION.csv", header = TRUE, sep = ",")

Distribution

# Fine-grained histogram of the population column (100 requested breaks),
# with black bars.
hist(USPopulation$Population, breaks = 100, col = "black")

# Coarser histogram of the same column (10 requested breaks).
hist(USPopulation$Population, breaks = 10)

# Normal Q-Q plot of the population values.
qqnorm(USPopulation$Population)

# Shapiro-Wilk normality test on the population values.
shapiro.test(USPopulation$Population)

## 
##  Shapiro-Wilk normality test
## 
## data:  USPopulation$Population
## W = 0.88825, p-value = 0.01742

non-linear regression

# Pearson correlation matrix across all columns, using complete rows only.
cor(x = USPopulation, method = "pearson", use = "complete.obs")

##            Population      Year    YearSq
## Population  1.0000000 0.9589847 0.9630800
## Year        0.9589847 1.0000000 0.9998886
## YearSq      0.9630800 0.9998886 1.0000000

# Covariance matrix across all columns, using complete rows only.
cov(x = USPopulation, use = "complete.obs")

##              Population         Year      YearSq
## Population     7604.771     5430.495    20671769
## Year           5430.495     4216.667    15981167
## YearSq     20671769.252 15981166.667 60582115000

# Test whether the Pearson correlation between population and year is zero.
cor.test(
  x = USPopulation$Population,
  y = USPopulation$Year,
  method = "pearson"
)

## 
##  Pearson's product-moment correlation
## 
## data:  USPopulation$Population and USPopulation$Year
## t = 15.13, df = 20, p-value = 2.052e-12
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  0.9021165 0.9831072
## sample estimates:
##       cor 
## 0.9589847

# Scatter plot of population against year with the least-squares line on top.
# Year goes on the x-axis and Population on the y-axis so that the plot uses
# the same orientation as the regression drawn by abline() (Population ~ Year).
# With the axes the other way round, the fitted line lies outside the plotting
# region and never shows up.
plot(USPopulation$Year, USPopulation$Population,
     xlab = "year", ylab = "population", pch = 21)

abline(lm(Population ~ Year, data = USPopulation))

# Simple linear regression: population as a linear function of year.
model <- lm(USPopulation$Population ~ USPopulation$Year)

# Show the call and the estimated coefficients.
print(model)

## 
## Call:
## lm(formula = USPopulation$Population ~ USPopulation$Year)
## 
## Coefficients:
##       (Intercept)  USPopulation$Year  
##         -2345.855              1.288

# Residual summary, coefficient table and fit statistics for `model`.
print(summary(model))

## 
## Call:
## lm(formula = USPopulation$Population ~ USPopulation$Year)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -25.26 -21.08 -10.12  19.54  51.55 
## 
## Coefficients:
##                     Estimate Std. Error t value Pr(>|t|)    
## (Intercept)       -2.346e+03  1.614e+02  -14.54 4.29e-12 ***
## USPopulation$Year  1.288e+00  8.512e-02   15.13 2.05e-12 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 25.33 on 20 degrees of freedom
## Multiple R-squared:  0.9197, Adjusted R-squared:  0.9156 
## F-statistic: 228.9 on 1 and 20 DF,  p-value: 2.052e-12

# Population against year, with the fitted line of `model` drawn on top.
plot(USPopulation$Population ~ USPopulation$Year)
abline(reg = model)

# Confidence interval for the mean population at a new year.
#
# predict() looks up the formula's variables by name in `newdata`. `model` was
# fitted as lm(USPopulation$Population ~ USPopulation$Year), so no column of a
# new data frame can supply its predictor: `newdata` is ignored, R warns that
# 'newdata' had 1 row but the variables found have 22 rows, and the fitted
# values for the original rows are returned. The column name `Units` did not
# match anything either. Refit the same regression with plain column names and
# `data =`, then pass the year of interest as `Year`.
year_model <- lm(Population ~ Year, data = USPopulation)

# 1925 is an example year; change it to the year you want to predict.
predict(year_model, newdata = data.frame(Year = 1925), interval = "confidence")

## Warning: 'newdata' had 1 row but variables found have 22 rows

##           fit        lwr        upr
## 1  -40.577759 -62.360203 -18.795315
## 2  -27.699115 -47.982649  -7.415581
## 3  -14.820471 -33.653260   4.012317
## 4   -1.941828 -19.384058  15.500403
## 5   10.936816  -5.190620  27.064252
## 6   23.815460   8.906995  38.723924
## 7   36.694103  22.883392  50.504815
## 8   49.572747  36.707503  62.437991
## 9   62.451391  50.343602  74.559180
## 10  75.330034  63.754719  86.905350
## 11  88.208678  76.909005  99.508351
## 12 101.087322  89.787649 112.386995
## 13 113.965966 102.390650 125.541281
## 14 126.844609 114.736820 138.952398
## 15 139.723253 126.858009 152.588497
## 16 152.601897 138.791185 166.412608
## 17 165.480540 150.572076 180.389005
## 18 178.359184 162.231748 194.486620
## 19 191.237828 173.795597 208.680058
## 20 204.116471 185.283683 222.949260
## 21 216.995115 196.711581 237.278649
## 22 229.873759 208.091315 251.656203

# Residuals of the simple linear model.
model.res <- residuals(model)

# Residuals against the observed population, with a reference line at zero.
# NOTE(review): residuals are plotted against the observed response here;
# residuals against fitted values is the more usual diagnostic - confirm intent.
plot(USPopulation$Population, model.res)
abline(h = 0)

# Per-observation influence diagnostics for `model`.
print(influence.measures(model))

## Influence measures of
##   lm(formula = USPopulation$Population ~ USPopulation$Year) :
## 
##      dfb.1_ dfb.USP.   dffit cov.r   cook.d    hat inf
## 1   0.82280 -0.80694  0.9428 0.885 0.380821 0.1700    
## 2   0.51210 -0.50118  0.6027 1.054 0.172123 0.1474    
## 3   0.29093 -0.28399  0.3544 1.161 0.063225 0.1270    
## 4   0.13031 -0.12679  0.1661 1.215 0.014344 0.1090    
## 5   0.01846 -0.01788  0.0250 1.221 0.000329 0.0932    
## 6  -0.05422  0.05223 -0.0797 1.195 0.003334 0.0796    
## 7  -0.08895  0.08499 -0.1469 1.153 0.011185 0.0683    
## 8  -0.09375  0.08844 -0.1831 1.115 0.017162 0.0593    
## 9  -0.08556  0.07891 -0.2152 1.073 0.023353 0.0525    
## 10 -0.06010  0.05268 -0.2289 1.046 0.026157 0.0480    
## 11 -0.02503  0.01758 -0.2238 1.043 0.024979 0.0457    
## 12  0.01004 -0.01746 -0.2222 1.045 0.024647 0.0457    
## 13  0.03923 -0.04573 -0.1987 1.074 0.019964 0.0480    
## 14  0.06719 -0.07347 -0.2004 1.085 0.020362 0.0525    
## 15  0.07748 -0.08253 -0.1709 1.122 0.014998 0.0593    
## 16  0.12685 -0.13321 -0.2302 1.104 0.026879 0.0683    
## 17  0.10601 -0.11033 -0.1684 1.163 0.014677 0.0796    
## 18 -0.00864  0.00893  0.0125 1.222 0.000082 0.0932    
## 19 -0.12736  0.13115  0.1718 1.213 0.015336 0.1090    
## 20 -0.28155  0.28893  0.3605 1.158 0.065341 0.1270    
## 21 -0.46854  0.47953  0.5766 1.072 0.158909 0.1474    
## 22 -0.95312  0.97334  1.1372 0.752 0.510853 0.1700   *

multiple linear regression

# Regression of population on year and year squared (a quadratic in year).
fit <- lm(Population ~ Year + YearSq, data = USPopulation)

# Residual summary, coefficient table and fit statistics for `fit`.
print(summary(fit))

## 
## Call:
## lm(formula = Population ~ Year + YearSq, data = USPopulation)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -7.5642 -0.3134  0.5894  1.4263  4.7578 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  2.163e+04  6.395e+02   33.83   <2e-16 ***
## Year        -2.405e+01  6.755e-01  -35.60   <2e-16 ***
## YearSq       6.684e-03  1.782e-04   37.51   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 3 on 19 degrees of freedom
## Multiple R-squared:  0.9989, Adjusted R-squared:  0.9988 
## F-statistic:  8864 on 2 and 19 DF,  p-value: < 2.2e-16

library(usdm)

## Loading required package: sp

## Loading required package: raster

library(car)

## 
## Attaching package: 'car'

## The following object is masked from 'package:usdm':
## 
##     vif

# Variance inflation factors for the predictors of `fit`.
# Both usdm and car export vif(); car was attached last, so its version is the
# one in effect. Name the package explicitly to make that visible.
car::vif(fit)

##     Year   YearSq 
## 4489.781 4489.781

Generating Predicted Values and Confidence Intervals

# Switch to the data folder and load the prediction input table.
# The CSV has a header row and comma-separated fields.
setwd("C:/Users/Yuan/Desktop/R/5")
PRED <- read.csv("PRED.csv", header = TRUE, sep = ",")

# Prediction interval for the population in a new year, using the quadratic
# model `fit` (Population ~ Year + YearSq) fitted earlier in this document.
#
# The previous, commented-out attempt failed because of the stray `+` inside
# data.frame(Year=1925, + YearSq=3705625), which is a syntax error. attach()
# is not needed either: predict() takes the new values through `newdata`.
new_year <- data.frame(Year = 1925, YearSq = 1925^2)
predict(fit, newdata = new_year, interval = "prediction")

# To predict for every row of PRED instead, pass `newdata = PRED`
# (assumes PRED has Year and YearSq columns - TODO confirm).

Note that the `echo = FALSE` parameter was added to the code chunk to prevent printing of the R code that generated the plot.

hw5code

Distribution

non-linear regression

multiple linear regression

Generating Predicted Values and Confidence Intervals