Lecture4.R

user — Sep 18, 2013, 2:03 PM

#class notes-Lecture # 4
gdURL<-"http://www.stat.ubc.ca/~jenny/notOcto/STAT545A/examples/gapminder/data/gapminderDataFiveYear.txt"
gDat<-read.delim(gdURL)
str(gDat)
'data.frame':   1704 obs. of  6 variables:
 $ country  : Factor w/ 142 levels "Afghanistan",..: 1 1 1 1 1 1 1 1 1 1 ...
 $ year     : int  1952 1957 1962 1967 1972 1977 1982 1987 1992 1997 ...
 $ pop      : num  8425333 9240934 10267083 11537966 13079460 ...
 $ continent: Factor w/ 5 levels "Africa","Americas",..: 3 3 3 3 3 3 3 3 3 3 ...
 $ lifeExp  : num  28.8 30.3 32 34 36.1 ...
 $ gdpPercap: num  779 821 853 836 740 ...
#read.table(gdURL, header =TRUE, sep="\t",quote="\"")
library(lattice)
library(plyr)

maxLeByCont <- ddply(gDat,~continent, summarize,maxLifeExp=max(lifeExp))
str(maxLeByCont)
'data.frame':   5 obs. of  2 variables:
 $ continent : Factor w/ 5 levels "Africa","Americas",..: 1 2 3 4 5
 $ maxLifeExp: num  76.4 80.7 82.6 81.8 81.2

minGDPByCont <- ddply(gDat,~continent, summarize,minGDP=min(gdpPercap))
minGDPByCont
  continent  minGDP
1    Africa   241.2
2  Americas  1201.6
3      Asia   331.0
4    Europe   973.5
5   Oceania 10039.6

str(minGDPByCont)
'data.frame':   5 obs. of  2 variables:
 $ continent: Factor w/ 5 levels "Africa","Americas",..: 1 2 3 4 5
 $ minGDP   : num  241 1202 331 974 10040

length(unique(gDat$country))
[1] 142

ddply(gDat, ~continent, summarize, nUniqCoutries = length(unique(country)))
  continent nUniqCoutries
1    Africa            52
2  Americas            25
3      Asia            33
4    Europe            30
5   Oceania             2

ddply(gDat, ~ continent, summarize, minlifeExp = min(lifeExp), maxlifeExp = max(lifeExp), gdpPercap = median(gdpPercap))
  continent minlifeExp maxlifeExp gdpPercap
1    Africa      23.60      76.44      1192
2  Americas      37.58      80.65      5466
3      Asia      28.80      82.60      2647
4    Europe      43.59      81.76     12082
5   Oceania      69.12      81.23     17983


jCountry <- "France"  # pick, but do not hard wire, on example
(jDat <- subset(gDat, country == jCountry)) 
    country year      pop continent lifeExp gdpPercap
529  France 1952 42459667    Europe   67.41      7030
530  France 1957 44310863    Europe   68.93      8663
531  France 1962 47124000    Europe   70.51     10560
532  France 1967 49569000    Europe   71.55     13000
533  France 1972 51732000    Europe   72.38     16107
534  France 1977 53165019    Europe   73.83     18293
535  France 1982 54433565    Europe   74.89     20294
536  France 1987 55630100    Europe   76.34     22066
537  France 1992 57374179    Europe   77.46     24704
538  France 1997 58623428    Europe   78.64     25890
539  France 2002 59925035    Europe   79.59     28926
540  France 2007 61083916    Europe   80.66     30470
xyplot(lifeExp ~ year, jDat, type = c("p", "r"))

plot of chunk unnamed-chunk-1

jFit <- lm(lifeExp ~ year, jDat)
summary(jFit)

Call:
lm(formula = lifeExp ~ year, data = jDat)

Residuals:
    Min      1Q  Median      3Q     Max 
-0.3801 -0.1389  0.0124  0.1429  0.3349 

Coefficients:
             Estimate Std. Error t value Pr(>|t|)    
(Intercept) -3.98e+02   7.29e+00   -54.6  1.0e-13 ***
year         2.38e-01   3.68e-03    64.8  1.9e-14 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 0.22 on 10 degrees of freedom
Multiple R-squared:  0.998, Adjusted R-squared:  0.997 
F-statistic: 4.2e+03 on 1 and 10 DF,  p-value: 1.86e-14
(yearMin <- min(gDat$year))
[1] 1952
jFit <- lm(lifeExp ~ I(year - yearMin), jDat)
summary(jFit)

Call:
lm(formula = lifeExp ~ I(year - yearMin), data = jDat)

Residuals:
    Min      1Q  Median      3Q     Max 
-0.3801 -0.1389  0.0124  0.1429  0.3349 

Coefficients:
                  Estimate Std. Error t value Pr(>|t|)    
(Intercept)       67.79013    0.11949   567.3  < 2e-16 ***
I(year - yearMin)  0.23850    0.00368    64.8  1.9e-14 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 0.22 on 10 degrees of freedom
Multiple R-squared:  0.998, Adjusted R-squared:  0.997 
F-statistic: 4.2e+03 on 1 and 10 DF,  p-value: 1.86e-14

#I inhabit interpretation of Objects
?I
starting httpd help server ... done

class(jFit)
[1] "lm"
model(jFit)
Error: could not find function "model"
names(jFit)
 [1] "coefficients"  "residuals"     "effects"       "rank"         
 [5] "fitted.values" "assign"        "qr"            "df.residual"  
 [9] "xlevels"       "call"          "terms"         "model"        


jFit$coefficients
      (Intercept) I(year - yearMin) 
          67.7901            0.2385 
coef(jFit)
      (Intercept) I(year - yearMin) 
          67.7901            0.2385 

jFun <- function(x) coef(lm(lifeExp ~ I(year - yearMin), x))
jFun(jDat)  # trying out our new function ... yes still get same numbers
      (Intercept) I(year - yearMin) 
          67.7901            0.2385 


jFun <- function(x) {
  estCoefs <- coef(lm(lifeExp ~ I(year - yearMin), x))
  names(estCoefs) <- c("intercept", "slope")
  return(estCoefs)
  }
jFun(jDat)  # trying out our improved function ... yes still get same numbers
intercept     slope 
  67.7901    0.2385 


jFun(subset(gDat, country == "Canada"))
intercept     slope 
  68.8838    0.2189 
jFun(subset(gDat, country == "Uruguay"))
intercept     slope 
  65.7416    0.1833 
jFun(subset(gDat, country == "India"))
intercept     slope 
  39.2698    0.5053 
jCoefs <- ddply(gDat, ~country, jFun)
str(jCoefs)
'data.frame':   142 obs. of  3 variables:
 $ country  : Factor w/ 142 levels "Afghanistan",..: 1 2 3 4 5 6 7 8 9 10 ...
 $ intercept: num  29.9 59.2 43.4 32.1 62.7 ...
 $ slope    : num  0.275 0.335 0.569 0.209 0.232 ...
tail(jCoefs)
               country intercept    slope
137          Venezuela     57.51  0.32972
138            Vietnam     39.01  0.67162
139 West Bank and Gaza     43.80  0.60110
140        Yemen, Rep.     30.13  0.60546
141             Zambia     47.66 -0.06043
142           Zimbabwe     55.22 -0.09302

install.packages("xtable", dependencies = TRUE)
Installing package into 'C:/Users/user/Documents/R/win-library/3.0' (as
'lib' is unspecified)
Error: trying to use CRAN without setting a mirror
library(xtable)
jCoefs <- ddply(gDat, ~country, jFun)
#jCoefs


jCoefs <- ddply(gDat, ~country + continent, jFun)
str(jCoefs)
'data.frame':   142 obs. of  4 variables:
 $ country  : Factor w/ 142 levels "Afghanistan",..: 1 2 3 4 5 6 7 8 9 10 ...
 $ continent: Factor w/ 5 levels "Africa","Americas",..: 3 4 1 1 2 5 4 3 3 4 ...
 $ intercept: num  29.9 59.2 43.4 32.1 62.7 ...
 $ slope    : num  0.275 0.335 0.569 0.209 0.232 ...

tail(jCoefs)
               country continent intercept    slope
137          Venezuela  Americas     57.51  0.32972
138            Vietnam      Asia     39.01  0.67162
139 West Bank and Gaza      Asia     43.80  0.60110
140        Yemen, Rep.      Asia     30.13  0.60546
141             Zambia    Africa     47.66 -0.06043
142           Zimbabwe    Africa     55.22 -0.09302

set.seed(916)
foo<-jCoefs[sample(nrow(jCoefs),size =15),]
foo<-xtable(foo)
foo
% latex table generated in R 3.0.1 by xtable 1.7-1 package
% Wed Sep 18 14:03:55 2013
\begin{table}[ht]
\centering
\begin{tabular}{rllrr}
  \hline
 & country & continent & intercept & slope \\ 
  \hline
73 & Lebanon & Asia & 58.69 & 0.26 \\ 
  111 & Senegal & Africa & 36.75 & 0.50 \\ 
  37 & Dominican Republic & Americas & 48.60 & 0.47 \\ 
  97 & Oman & Asia & 37.21 & 0.77 \\ 
  48 & Germany & Europe & 67.57 & 0.21 \\ 
  70 & Korea, Dem. Rep. & Asia & 54.91 & 0.32 \\ 
  82 & Mauritius & Africa & 55.37 & 0.35 \\ 
  115 & Slovak Republic & Europe & 67.01 & 0.13 \\ 
  27 & Comoros & Africa & 40.00 & 0.45 \\ 
  5 & Argentina & Americas & 62.69 & 0.23 \\ 
  22 & Central African Republic & Africa & 38.81 & 0.18 \\ 
  38 & Ecuador & Americas & 49.07 & 0.50 \\ 
  139 & West Bank and Gaza & Asia & 43.80 & 0.60 \\ 
  39 & Egypt & Africa & 40.97 & 0.56 \\ 
  88 & Myanmar & Asia & 41.41 & 0.43 \\ 
   \hline
\end{tabular}
\end{table}
print(foo,type = "html",include.rownames=FALSE)
<!-- html table generated in R 3.0.1 by xtable 1.7-1 package -->
<!-- Wed Sep 18 14:03:55 2013 -->
<TABLE border=1>
<TR> <TH> country </TH> <TH> continent </TH> <TH> intercept </TH> <TH> slope </TH>  </TR>
  <TR> <TD> Lebanon </TD> <TD> Asia </TD> <TD align="right"> 58.69 </TD> <TD align="right"> 0.26 </TD> </TR>
  <TR> <TD> Senegal </TD> <TD> Africa </TD> <TD align="right"> 36.75 </TD> <TD align="right"> 0.50 </TD> </TR>
  <TR> <TD> Dominican Republic </TD> <TD> Americas </TD> <TD align="right"> 48.60 </TD> <TD align="right"> 0.47 </TD> </TR>
  <TR> <TD> Oman </TD> <TD> Asia </TD> <TD align="right"> 37.21 </TD> <TD align="right"> 0.77 </TD> </TR>
  <TR> <TD> Germany </TD> <TD> Europe </TD> <TD align="right"> 67.57 </TD> <TD align="right"> 0.21 </TD> </TR>
  <TR> <TD> Korea, Dem. Rep. </TD> <TD> Asia </TD> <TD align="right"> 54.91 </TD> <TD align="right"> 0.32 </TD> </TR>
  <TR> <TD> Mauritius </TD> <TD> Africa </TD> <TD align="right"> 55.37 </TD> <TD align="right"> 0.35 </TD> </TR>
  <TR> <TD> Slovak Republic </TD> <TD> Europe </TD> <TD align="right"> 67.01 </TD> <TD align="right"> 0.13 </TD> </TR>
  <TR> <TD> Comoros </TD> <TD> Africa </TD> <TD align="right"> 40.00 </TD> <TD align="right"> 0.45 </TD> </TR>
  <TR> <TD> Argentina </TD> <TD> Americas </TD> <TD align="right"> 62.69 </TD> <TD align="right"> 0.23 </TD> </TR>
  <TR> <TD> Central African Republic </TD> <TD> Africa </TD> <TD align="right"> 38.81 </TD> <TD align="right"> 0.18 </TD> </TR>
  <TR> <TD> Ecuador </TD> <TD> Americas </TD> <TD align="right"> 49.07 </TD> <TD align="right"> 0.50 </TD> </TR>
  <TR> <TD> West Bank and Gaza </TD> <TD> Asia </TD> <TD align="right"> 43.80 </TD> <TD align="right"> 0.60 </TD> </TR>
  <TR> <TD> Egypt </TD> <TD> Africa </TD> <TD align="right"> 40.97 </TD> <TD align="right"> 0.56 </TD> </TR>
  <TR> <TD> Myanmar </TD> <TD> Asia </TD> <TD align="right"> 41.41 </TD> <TD align="right"> 0.43 </TD> </TR>
   </TABLE>