Pres 2 Final.R

Scott Callahan — Nov 19, 2013, 9:45 PM

##Determinants of Literacy Rate
#========================================================
#  author: Alan Adelman, Scott Callahan, Omer Kutlubay
#date: Presentation 2

#Running the regression with all variables
#========================================================
library(ggplot2)

Warning: package 'ggplot2' was built under R version 3.0.2

ltrdata10 <- read.csv("~/Group_project-figure/ltrdata10.csv")
ltrdata10$GDP <-log(ltrdata10$GDP)
View(ltrdata10)
row.names(ltrdata10) <- ltrdata10$Country.Name
ltrdata10$Country.Name <- NULL
ltrdata10$Region <- factor(ltrdata10$Region)
levels(ltrdata10$Region) <- c("Africa", "Asia", "Oceanic", "C. America", "Europe","Mid. East", "N. America", "S. America")
## Locate all 0's in the data set and set them to NA
ltrdata10[ltrdata10 == 0] <- NA
fit <- lm(LTR ~ GDP+ UNEMP+ URBGR  + MOBILE  + LEXP + INTERNET , data = ltrdata10)

(sse <- deviance(fit))

[1] 0.7454

(df.residual(fit))

[1] 88


summary(fit)


Call:
lm(formula = LTR ~ GDP + UNEMP + URBGR + MOBILE + LEXP + INTERNET, 
    data = ltrdata10)

Residuals:
    Min      1Q  Median      3Q     Max 
-0.3242 -0.0469  0.0023  0.0365  0.2274 

Coefficients:
             Estimate Std. Error t value Pr(>|t|)   
(Intercept)  0.064632   0.149502    0.43   0.6666   
GDP          0.034831   0.013625    2.56   0.0123 * 
UNEMP        0.239559   0.117059    2.05   0.0437 * 
URBGR       -1.274271   0.552218   -2.31   0.0234 * 
MOBILE       0.000920   0.000317    2.90   0.0047 **
LEXP         0.006380   0.001879    3.40   0.0010 **
INTERNET    -0.000652   0.000742   -0.88   0.3819   
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 0.092 on 88 degrees of freedom
Multiple R-squared:  0.635, Adjusted R-squared:  0.61 
F-statistic: 25.5 on 6 and 88 DF,  p-value: <2e-16

options(show.signif.stars=FALSE, digits=3)
#==================================
#Remove the INTERNET variable

fit2 <- lm(LTR ~ GDP + UNEMP + URBGR + MOBILE + LEXP, ltrdata10)
(sse2 <- deviance(fit2))

[1] 0.752

(fstat <- (deviance(fit2)-deviance(fit))/(deviance(fit)/df.residual(fit)))

[1] 0.772

1-pf(fstat,1,df.residual(fit)) # p-value

[1] 0.382



sqrt(fstat)

[1] 0.879

(tstat <- summary(fit)$coef[2,3])

[1] 2.56

2*(1-pt(abs(tstat),45)) # p-value

[1] 0.014

summary(fit2)


Call:
lm(formula = LTR ~ GDP + UNEMP + URBGR + MOBILE + LEXP, data = ltrdata10)

Residuals:
    Min      1Q  Median      3Q     Max 
-0.3301 -0.0451  0.0015  0.0345  0.2240 

Coefficients:
             Estimate Std. Error t value Pr(>|t|)
(Intercept)  0.156962   0.106225    1.48   0.1430
GDP          0.027269   0.010551    2.58   0.0114
UNEMP        0.223462   0.115469    1.94   0.0561
URBGR       -1.292914   0.551104   -2.35   0.0212
MOBILE       0.000900   0.000316    2.85   0.0055
LEXP         0.005727   0.001723    3.32   0.0013

Residual standard error: 0.0919 on 89 degrees of freedom
Multiple R-squared:  0.631, Adjusted R-squared:  0.611 
F-statistic: 30.5 on 5 and 89 DF,  p-value: <2e-16

anova(fit2,fit)

Analysis of Variance Table

Model 1: LTR ~ GDP + UNEMP + URBGR + MOBILE + LEXP
Model 2: LTR ~ GDP + UNEMP + URBGR + MOBILE + LEXP + INTERNET
  Res.Df   RSS Df Sum of Sq    F Pr(>F)
1     89 0.752                         
2     88 0.745  1   0.00654 0.77   0.38


#Remove GDP
fit3 <- lm(LTR ~ UNEMP+ URBGR + MOBILE + LEXP + INTERNET, ltrdata10)
anova(fit3, fit)

Analysis of Variance Table

Model 1: LTR ~ UNEMP + URBGR + MOBILE + LEXP + INTERNET
Model 2: LTR ~ GDP + UNEMP + URBGR + MOBILE + LEXP + INTERNET
  Res.Df   RSS Df Sum of Sq    F Pr(>F)
1     89 0.801                         
2     88 0.745  1    0.0554 6.53  0.012


#we picked the larger model, but wont use INTERNET because of multicollinearity with GDP

#Residuals
par(mfrow=c(1,1), bg="white")

rf <- lm(fit$residuals~fit$fitted.values)
plot(fit$residuals~fit$fitted.values,type="h",col="darkblue",cex.main = 1.0, font.main = 2,main="Residuals vs Fitted", xlab="Fitted Values", ylab="Residuals")
abline(rf)

plot of chunk unnamed-chunk-1


rf2 <- lm(fit2$residuals~fit2$fitted.values)
plot(fit2$residuals~fit2$fitted.values,type="h",col="darkred", cex.main = 1.0, font.main = 2,main="Residuals vs Fitted 2", xlab="Fitted Values", ylab="Residuals")
abline(rf2)

plot of chunk unnamed-chunk-1


rf3 <- lm(fit3$residuals~fit3$fitted.values)
plot(fit3$residuals~fit3$fitted.values,type="h", col="black",cex.main = 1.0, font.main = 2,main="Residuals vs Fitted 3", xlab="Fitted Values", ylab="Residuals")
abline(rf3)

plot of chunk unnamed-chunk-1


#We use this as our model
g <- lm(LTR ~ GDP +UNEMP+ URBGR + MOBILE + LEXP , ltrdata10)
summary(g)


Call:
lm(formula = LTR ~ GDP + UNEMP + URBGR + MOBILE + LEXP, data = ltrdata10)

Residuals:
    Min      1Q  Median      3Q     Max 
-0.3301 -0.0451  0.0015  0.0345  0.2240 

Coefficients:
             Estimate Std. Error t value Pr(>|t|)
(Intercept)  0.156962   0.106225    1.48   0.1430
GDP          0.027269   0.010551    2.58   0.0114
UNEMP        0.223462   0.115469    1.94   0.0561
URBGR       -1.292914   0.551104   -2.35   0.0212
MOBILE       0.000900   0.000316    2.85   0.0055
LEXP         0.005727   0.001723    3.32   0.0013

Residual standard error: 0.0919 on 89 degrees of freedom
Multiple R-squared:  0.631, Adjusted R-squared:  0.611 
F-statistic: 30.5 on 5 and 89 DF,  p-value: <2e-16


#Calculating CI for GDP the hard way

alpha <- (1-.95)
qt(1-alpha/2,df.residual(g))

[1] 1.99



c(.027269-1.99*.010551, .027269+1.99*.010551)

[1] 0.00627 0.04827


#95% confidece interval
#============================
  #one-at-a-time

confint(g)

                2.5 %   97.5 %
(Intercept) -0.054105  0.36803
GDP          0.006304  0.04823
UNEMP       -0.005972  0.45290
URBGR       -2.387946 -0.19788
MOBILE       0.000271  0.00153
LEXP         0.002303  0.00915


#99% confidence interval

  #one-at-a-time
confint(g, level = 0.99)

                0.5 %  99.5 %
(Intercept) -1.23e-01 0.43657
GDP         -5.04e-04 0.05504
UNEMP       -8.05e-02 0.52740
URBGR       -2.74e+00 0.15771
MOBILE       6.74e-05 0.00173
LEXP         1.19e-03 0.01026

#==================================
library(ellipse)

Warning: package 'ellipse' was built under R version 3.0.2


plot(ellipse(g,c(2,3)),type="l",xlim=c(-.01,.07))

points(0,0)

points(coef(g)[2], coef(g)[3], pch=18)

abline(v=confint(g)[2,], lty=2)

abline(h=confint(g)[3,], lty=2)

plot of chunk unnamed-chunk-1


#================================
cor(ltrdata10$URBGR, ltrdata10$MOBILE)

[1] -0.311


summary(g, corr=TRUE)$corr

            (Intercept)     GDP   UNEMP   URBGR  MOBILE   LEXP
(Intercept)       1.000 -0.1216 -0.5078 -0.5028  0.2113 -0.726
GDP              -0.122  1.0000 -0.1223 -0.0512 -0.3864 -0.531
UNEMP            -0.508 -0.1223  1.0000  0.2508  0.0409  0.395
URBGR            -0.503 -0.0512  0.2508  1.0000  0.1151  0.328
MOBILE            0.211 -0.3864  0.0409  0.1151  1.0000 -0.163
LEXP             -0.726 -0.5305  0.3949  0.3276 -0.1629  1.000


#==================================

plot(ellipse(g,c(4,5)),type="l",xlim=c(-3,.1), ylim=c(-.0005, .002))

points(0,0)

points(coef(g)[4], coef(g)[5], pch=18)

abline(v=confint(g)[4,], lty=2)

abline(h=confint(g)[5,], lty=2)

plot of chunk unnamed-chunk-1


#Setting up for Predition Intervals
#====================================================

g <- lm(LTR ~ GDP+UNEMP+ URBGR + MOBILE + LEXP , ltrdata10)

x0 <- data.frame(URBGR=0.03,UNEMP=.04, MOBILE=75, LEXP=50 ,GDP=8)

str(predict(g,x0,se=TRUE))

List of 4
 $ fit           : Named num 0.699
  ..- attr(*, "names")= chr "1"
 $ se.fit        : num 0.0353
 $ df            : int 89
 $ residual.scale: num 0.0919


#Prediction Intervals
#========================
  #Mean Response (Average)
predict(g,x0,interval="confidence", level=.95)

    fit   lwr   upr
1 0.699 0.629 0.769


#New Response (One-at-a-time)

predict(g,x0,interval="prediction")

    fit   lwr   upr
1 0.699 0.503 0.895

#======================================

grid <- seq(0,300,1)
p <- predict(g, data.frame(URBGR=0.03,UNEMP=.04, MOBILE=grid, LEXP=50 ,GDP=8), se=T, interval="confidence")

matplot(grid, p$fit, lty=c(1,2,2), type="l",xlab="Mobile Subscription Rate", ylab="Literacy Rate")

rug(ltrdata10$MOBILE)

plot of chunk unnamed-chunk-1


# 3D Scatterplot

library(scatterplot3d)

data(ltrdata10)

Warning: data set 'ltrdata10' not found


attach(ltrdata10)

scatterplot3d(GDP,MOBILE,LTR, main="3D Scatterplot")

plot of chunk unnamed-chunk-1


s3d <- scatterplot3d(GDP,MOBILE,LTR, pch=16, highlight.3d=TRUE, type="h", main="3D Scatterplot")

fit <- lm(LTR ~ GDP + MOBILE)

s3d$plane3d(fit)

plot of chunk unnamed-chunk-1


scatterplot3d(UNEMP,GDP,LTR, main="3D Scatterplot")

plot of chunk unnamed-chunk-1


s3d <- scatterplot3d(UNEMP,GDP, LTR, pch=16, highlight.3d=TRUE, type="h", main="3D Scatterplot")

fit2 <- lm(LTR ~UNEMP + GDP)

s3d$plane3d(fit2)

plot of chunk unnamed-chunk-1


library(rgl)

Warning: package 'rgl' was built under R version 3.0.2


plot3d(GDP,MOBILE,LTR, col = rainbow(97, end = 5/6)[rank(LTR)], size = 11)

detach(ltrdata10)