Advanced Geographical Statistics

Assignment 5

GEOG 6000

Date: 10/21/2015

Sulochan Dhungel

\(Answer 1\)

setwd("C:/Users/Sulochan/Copy/Fall 2015/Advance Geo/Assignment 5")
GlbTemp = read.csv("GlobalTemp.csv")
head(GlbTemp)
##   Year   NHTemp   SHTemp
## 1 1000 -0.27668 -0.45604
## 2 1001 -0.23782 -0.40895
## 3 1002 -0.20315 -0.37730
## 4 1003 -0.17683 -0.35891
## 5 1004 -0.16292 -0.34768
## 6 1005 -0.16467 -0.33543
# NHTemp & SHTemp => annual temperature for the Northern and Southern Hemispheres

summary(GlbTemp)
##       Year          NHTemp            SHTemp       
##  Min.   :1000   Min.   :-1.0527   Min.   :-0.9452  
##  1st Qu.:1249   1st Qu.:-0.4997   1st Qu.:-0.4482  
##  Median :1498   Median :-0.3815   Median :-0.3234  
##  Mean   :1498   Mean   :-0.3804   Mean   :-0.3175  
##  3rd Qu.:1746   3rd Qu.:-0.2734   3rd Qu.:-0.1936  
##  Max.   :1995   Max.   : 0.2534   Max.   : 0.1904
#boxplot(GlbTemp$NHTemp, GlbTemp$SHTemp)

plot(1:dim(GlbTemp)[1], GlbTemp$NHTemp, "l", lwd = 2, col = "red", xlab = "Years", ylab = "Temp relative to Avg 1961-1990")
grid()
abline(h=mean(GlbTemp$NHTemp,na.rm=T), col = "red", lty=2)
lines(1:dim(GlbTemp)[1], GlbTemp$SHTemp, "l", lwd = 2, col = "blue")
abline(h=mean(GlbTemp$SHTemp,na.rm=T), col = "blue", lty=2)
legend("topleft",legend=c("N Hem","S Hem"), lty=1,lwd =2, col=c("red","blue"), bty="n")

NHTemp.loess1 <- loess(NHTemp ~ Year, data=GlbTemp, span=0.6)
summary(NHTemp.loess1)
## Call:
## loess(formula = NHTemp ~ Year, data = GlbTemp, span = 0.6)
## 
## Number of Observations: 996 
## Equivalent Number of Parameters: 5.28 
## Residual Standard Error: 0.1585 
## Trace of smoother matrix: 5.77 
## 
## Control settings:
##   normalize:  TRUE 
##   span       :  0.6 
##   degree   :  2 
##   family   :  gaussian
##   surface  :  interpolate      cell = 0.2
SHTemp.loess1 <- loess(SHTemp ~ Year, data=GlbTemp, span=0.6)
summary(SHTemp.loess1)
## Call:
## loess(formula = SHTemp ~ Year, data = GlbTemp, span = 0.6)
## 
## Number of Observations: 996 
## Equivalent Number of Parameters: 5.28 
## Residual Standard Error: 0.1779 
## Trace of smoother matrix: 5.77 
## 
## Control settings:
##   normalize:  TRUE 
##   span       :  0.6 
##   degree   :  2 
##   family   :  gaussian
##   surface  :  interpolate      cell = 0.2
Std_error_NHtemp = summary(NHTemp.loess1)$s
Std_error_SHtemp = summary(SHTemp.loess1)$s

Residual Error for North Hemisphere Temp Loess model = 0.1584804
Residual Error for South Hemisphere Temp Loess model = 0.1779037

newyrs <- data.frame(Year=seq(1000,2000,by=1))
newNHTemp <- predict(NHTemp.loess1, newdata=newyrs)
plot(GlbTemp$Year, GlbTemp$NHTemp, 
     col='red', pch=1,"p",cex=0.35,
     xlab="Year",
     ylab = "Temp relative to Avg 1961-1990",
     main ="LOESS models (window = 60%)")
lines(newyrs$Year, newNHTemp,
      col="red", lwd=2,"l",pch=0.2)

newSHTemp <- predict(SHTemp.loess1, newdata=newyrs)
lines(GlbTemp$Year, GlbTemp$SHTemp, col='blue', pch=1,"p",cex=0.35)
lines(newyrs$Year, newSHTemp, col="blue", lwd=2,"l",pch=0.2)
legend("topleft",legend=c("N Hem","S Hem"),
       lty=1,lwd =2, col=c("red","blue"), bty="n")


On the basis of the figures, I think the temperatures trends were not same for both hemispheres. For example, from 1000 to around 1300, temperature was increasing in Southern hemisphere while it was decreasing in Northern Hemisphere. Also during the last 200 years, temperatures in both hemisphere have increased.

NHTemp.loess1 <- loess(NHTemp ~ Year, data=GlbTemp, span=.05)
SHTemp.loess1 <- loess(SHTemp ~ Year, data=GlbTemp, span=.05)

NHTemp.loess1 <- loess(NHTemp ~ Year, data=GlbTemp, span=.15)
SHTemp.loess1 <- loess(SHTemp ~ Year, data=GlbTemp, span=.15)

NHTemp.loess1 <- loess(NHTemp ~ Year, data=GlbTemp, span=.25)
SHTemp.loess1 <- loess(SHTemp ~ Year, data=GlbTemp, span=.25)


Based on the three different windows tried (5%, 15% and 25%), emphasis on higher frequency variability decreased withe increase in span (window size), due to the nature of LOESS regression. But at 5%, the high frequency variability was captured but smoothing was lost. At 25%, the smoothing was high so high frequency variabilty was not captured well. So, I think, the best window for capturing high frequency variability for this given dataset would be at 15%.

Extra: RELATION BETWEEN SPAN AND STANDARD ERROR

Std_error_NHtemp = c()
Std_error_SHtemp = c()

span_ctr = c()
for (i in 1:100){
span_ctr = c(span_ctr, 0.01*i)
NHTemp.loess1 <- loess(NHTemp ~ Year, data=GlbTemp, span=span_ctr[i])
SHTemp.loess1 <- loess(SHTemp ~ Year, data=GlbTemp, span=span_ctr[i])
Std_error_NHtemp = c(Std_error_NHtemp, summary(NHTemp.loess1)$s)
Std_error_SHtemp = c(Std_error_SHtemp, summary(SHTemp.loess1)$s)
}
plot(span_ctr, Std_error_SHtemp,col="blue","l",
     xlab = "span", ylab ="Standard Error",
     main = "Variation of std error with span")
lines(span_ctr, Std_error_NHtemp,col="red","l")
legend("topleft",legend=c("N Hem","S Hem"),
       lty=1,lwd =2, col=c("red","blue"), bty="n")
grid()


This portion of code was writtend to check if standard error could give an idea on choosing span. The figure shows that at about span size = 0.25, the standard error almost levels out and does not increase with increasing span size. So, if the span size can be reduced below 0.25 for the specific case, without hurting smoothing too much, it would help improve the loess model.

Answer 2

pollen = read.csv("WNApollen.csv")
#head(pollen)
dim(pollen)
## [1] 2013   95
#install.packages("mgcv",repos="http://cran.us.r-project.org")
library(mgcv)
## Loading required package: nlme
## This is mgcv 1.8-7. For overview type 'help("mgcv-package")'.
#colnames(pollen)
pairs(~ARTEMISIA+gdd5+chill+mtco+mipt,data=pollen,cex=0.5)

pollen.gam = gam(ARTEMISIA~s(gdd5)+s(chill)+s(mtco)+s(mipt), data=pollen)
pollen.gam
## 
## Family: gaussian 
## Link function: identity 
## 
## Formula:
## ARTEMISIA ~ s(gdd5) + s(chill) + s(mtco) + s(mipt)
## 
## Estimated degrees of freedom:
## 7.47 8.49 7.43 8.67  total = 33.05 
## 
## GCV score: 0.01747729


Null Hypothesis: No relationship between response and predictor
Alternate Hypothesis: Some relationship exists between response and predictor

summary(pollen.gam)
## 
## Family: gaussian 
## Link function: identity 
## 
## Formula:
## ARTEMISIA ~ s(gdd5) + s(chill) + s(mtco) + s(mipt)
## 
## Parametric coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 0.091524   0.002922   31.32   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Approximate significance of smooth terms:
##            edf Ref.df      F  p-value    
## s(gdd5)  7.467  8.378 10.335 9.46e-15 ***
## s(chill) 8.493  8.906 10.453 1.17e-15 ***
## s(mtco)  7.428  8.357  5.287 1.53e-06 ***
## s(mipt)  8.665  8.964 15.528  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## R-sq.(adj) =  0.292   Deviance explained = 30.3%
## GCV = 0.017477  Scale est. = 0.01719   n = 2013


The results indicate that allfour terms are significant, which indicates all of the variables have a relationship with response- abudance of Artemisia.
Overall R - squared = 0.292
Percentage of deviance explained = 30.3%

pollen.gam2 = gam(ARTEMISIA~s(mtco)+s(mipt), data=pollen)
summary(pollen.gam2)
## 
## Family: gaussian 
## Link function: identity 
## 
## Formula:
## ARTEMISIA ~ s(mtco) + s(mipt)
## 
## Parametric coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 0.091524   0.003046   30.04   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Approximate significance of smooth terms:
##           edf Ref.df     F p-value    
## s(mtco) 8.679  8.968 37.18  <2e-16 ***
## s(mipt) 8.186  8.822 32.04  <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## R-sq.(adj) =   0.23   Deviance explained = 23.7%
## GCV = 0.01885  Scale est. = 0.018682  n = 2013


New R-squared = 0.23
Percentage of deviance explained = 23.7%

anova(pollen.gam2, pollen.gam)
## Analysis of Deviance Table
## 
## Model 1: ARTEMISIA ~ s(mtco) + s(mipt)
## Model 2: ARTEMISIA ~ s(gdd5) + s(chill) + s(mtco) + s(mipt)
##   Resid. Df Resid. Dev     Df Deviance
## 1    1995.1     37.274                
## 2    1979.9     34.036 15.188   3.2379

Since the decrease in deviance between the model is only 3.24, I do not consider this model to be worse than the first one (with 4 variables).

  vis.gam(pollen.gam2, theta=130, phi=40, ticktype="detailed")

# mtco = Mean winter temperature 
# mipt = Priestly-Taylor Moisture Index (0-dry, 1-wet)

From the figure, the optimal climate lies when mean winter temperature (mtco) is about -10 degrees and moisture index(mipt) is close to 0.3.

newclim <- data.frame(mtco=-10, mipt=0.35)
predict(pollen.gam2, newdata=newclim, se.fit=TRUE)
## $fit
##         1 
## 0.2324331 
## 
## $se.fit
##          1 
## 0.01276561

The predicted value of Artemisia at -10 deg C and moisture index of 0.35, is 0.2324.

Answer 3

birthwt = read.csv("lowbirthwgt.csv")
head(birthwt)
##   low age lwt race smoke ptl ht ui ftv  bwt
## 1   0  19 182    2     0   0  0  1   0 2523
## 2   0  33 155    3     0   0  0  0   3 2551
## 3   0  20 105    1     1   0  0  0   1 2557
## 4   0  21 108    1     1   0  0  1   2 2594
## 5   0  18 107    1     1   0  0  1   0 2600
## 6   0  21 124    3     0   0  0  0   0 2622
#install.packages("randomForest",repos="http://cran.us.r-project.org")
library(randomForest)
## randomForest 4.6-12
## Type rfNews() to see new features/changes/bug fixes.
birthwt$low <- factor(birthwt$low, labels=c("normal","low"))
birthwt$race <- factor(birthwt$race, labels=c("white","black","other"))
birthwt$smoke <- factor(birthwt$smoke, labels=c("notsmoker","smoker"))
birthwt$ptl <- factor(birthwt$ptl, ordered=TRUE)
birthwt$ht <- factor(birthwt$ht, labels=c("noht","ht"))
birthwt$ui <- factor(birthwt$smoke, labels=c("noui","ui"))
birthwt$ftv <- factor(birthwt$ftv, ordered=TRUE)
birthwt2 <- birthwt[,-10]
head(birthwt2)
##      low age lwt  race     smoke ptl   ht   ui ftv
## 1 normal  19 182 black notsmoker   0 noht noui   0
## 2 normal  33 155 other notsmoker   0 noht noui   3
## 3 normal  20 105 white    smoker   0 noht   ui   1
## 4 normal  21 108 white    smoker   0 noht   ui   2
## 5 normal  18 107 white    smoker   0 noht   ui   0
## 6 normal  21 124 other notsmoker   0 noht noui   0
birthwt2.rf1 = randomForest(low ~ .,dat=birthwt2,ntree=500, mtry=2)
birthwt2.rf1
## 
## Call:
##  randomForest(formula = low ~ ., data = birthwt2, ntree = 500,      mtry = 2) 
##                Type of random forest: classification
##                      Number of trees: 500
## No. of variables tried at each split: 2
## 
##         OOB estimate of  error rate: 32.28%
## Confusion matrix:
##        normal low class.error
## normal    114  16   0.1230769
## low        45  14   0.7627119
birthwt2.rf1$confusion
##        normal low class.error
## normal    114  16   0.1230769
## low        45  14   0.7627119


The confusion matrix shows that the error for normal babies is lot less than the low weight babies.

plot(birthwt2.rf1, main = "Model misclassification error")


The plot shows that errors have stabilized. This type of error rates is available only for classification in randomforest.

varImpPlot(birthwt2.rf1, main = "Variable importance plot ")


The most important factors controlling low birth outcomes are lwt (MOther’s weight) and age (Age of mother). There is a sudden drop in mean decrease in Gini Index after these two variables.

## Data for prediction
newdata <- read.csv("lowbirthwgtpred.csv")
newdata$low <- factor(newdata$low, levels=c("normal","low"),
                      labels=c("normal","low"))
newdata$race <- factor(newdata$race, levels=c("white","black","other"),
                       labels=c("white","black","other"))
newdata$smoke <- factor(newdata$smoke, levels=c("notsmoker","smoker"),
                        labels=c("notsmoker","smoker"))
newdata$ptl <- factor(newdata$ptl, ordered=TRUE)
newdata$ht <- factor(newdata$ht, levels=c("noht","ht"),
                     labels=c("noht","ht"))
newdata$ui <- factor(newdata$ui, levels=c("noui","ui"),
                     labels=c("noui","ui"))
newdata$ftv <- factor(newdata$ftv, ordered=TRUE)
newdata
##    low age lwt  race  smoke ptl ht   ui ftv
## 1 <NA>  21 100 white smoker   1 ht noui   3
predict(birthwt2.rf1, newdata)
##      1 
## normal 
## Levels: normal low


For the given data, random forest model predicts that the birth weight will be normal. To get the probability, we use results from all trees.

newdata.pred = predict(birthwt2.rf1, newdata, predict.all=TRUE)
table(newdata.pred$individual)
## 
##    low normal 
##    170    330


The probability of low birth for this person = 174/500 = 34.8%