Clase 4.R

Roger Guevara — May 22, 2014, 6:01 PM

rm(list=ls()) #elimina TODOS los objetos del ambiente de trabajo
source("~/desktop/cursos R/2013/funciones.r")

DATOS <- read.table("~/desktop/cursos R/2014/productivity.csv", header = TRUE, sep =",") 
attach(DATOS)
names(DATOS)
[1] "productivity" "mammals"      "region"      

sc(productivity, region)

M <- lm(productivity ~ region)
summary(M)

Call:
lm(formula = productivity ~ region)

Residuals:
   Min     1Q Median     3Q    Max 
 -7.00  -2.25   0.00   2.25   7.00 

Coefficients:
            Estimate Std. Error t value Pr(>|t|)    
(Intercept)     4.50       1.03    4.39  6.4e-05 ***
regionb         3.00       1.45    2.07    0.044 *  
regionc         7.00       1.45    4.83  1.5e-05 ***
regiond        12.00       1.45    8.27  1.0e-10 ***
regione        10.00       1.45    6.89  1.2e-08 ***
regionf        18.50       1.45   12.75  < 2e-16 ***
regiong        14.00       1.57    8.94  1.1e-11 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1 

Residual standard error: 2.9 on 47 degrees of freedom
Multiple R-squared: 0.829,  Adjusted R-squared: 0.807 
F-statistic:   38 on 6 and 47 DF,  p-value: <2e-16 
summary.aov(M)
            Df Sum Sq Mean Sq F value Pr(>F)    
region       6   1920     320      38 <2e-16 ***
Residuals   47    396       8                   
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1 
summary(M)

Call:
lm(formula = productivity ~ region)

Residuals:
   Min     1Q Median     3Q    Max 
 -7.00  -2.25   0.00   2.25   7.00 

Coefficients:
            Estimate Std. Error t value Pr(>|t|)    
(Intercept)     4.50       1.03    4.39  6.4e-05 ***
regionb         3.00       1.45    2.07    0.044 *  
regionc         7.00       1.45    4.83  1.5e-05 ***
regiond        12.00       1.45    8.27  1.0e-10 ***
regione        10.00       1.45    6.89  1.2e-08 ***
regionf        18.50       1.45   12.75  < 2e-16 ***
regiong        14.00       1.57    8.94  1.1e-11 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1 

Residual standard error: 2.9 on 47 degrees of freedom
Multiple R-squared: 0.829,  Adjusted R-squared: 0.807 
F-statistic:   38 on 6 and 47 DF,  p-value: <2e-16 

par(mfrow=c(2, 2))

plot of chunk unnamed-chunk-1

plot(M)

plot of chunk unnamed-chunk-1


#Cambiar order y naombres de las etiquetas en un factor
#levels(region) <- list("G" = "g", "B" = "b", "C" = "c", "D" = "d", "E" = "e", "F" = "f", "A" = "a")

#Generar vectores para contrastes
A <- c(1,0,0,0,0,0,0)
B <- c(1,1,0,0,0,0,0)
C <- c(1,0,1,0,0,0,0)
D <- c(1,0,0,1,0,0,0)
E <- c(1,0,0,0,1,0,0)
F <- c(1,0,0,0,0,1,0)
G <- c(1,0,0,0,0,0,1)
DB_C <- B-C
DB_D <- B-D
DB_E <- B-E
DD_E <- D-E


library(gmodels)
Warning: package 'gmodels' was built under R version 2.15.3
estimable(M, rbind(A,B,C,D,E,F,G,DB_C,DB_D, DB_E, DD_E))
     Estimate Std. Error t value DF  Pr(>|t|)
A         4.5      1.026   4.388 47 6.436e-05
B         7.5      1.026   7.313 47 2.747e-09
C        11.5      1.026  11.213 47 7.105e-15
D        16.5      1.026  16.088 47 0.000e+00
E        14.5      1.026  14.138 47 0.000e+00
F        23.0      1.026  22.426 47 0.000e+00
G        18.5      1.184  15.622 47 0.000e+00
DB_C     -4.0      1.450  -2.758 47 8.262e-03
DB_D     -9.0      1.450  -6.205 47 1.316e-07
DB_E     -7.0      1.450  -4.826 47 1.510e-05
DD_E      2.0      1.450   1.379 47 1.745e-01
tapply(productivity, region, ee)
     a      b      c      d      e      f      g 
0.8660 0.8660 0.8660 0.8660 0.8660 1.7321 0.7638 

MEDIAS <- estimable(M, rbind(A,B,C,D,E,F,G))[, 1]
ERRORES <- estimable(M, rbind(A,B,C,D,E,F,G))[, 2]

par(mfrow=c(1, 1))
barplot.error(MEDIAS, ERRORES, col = "red", names.arg=LETTERS[1:7], ylim = c(0,25))

text(c(2,5,8,11,14,17,20), c(6,9,13,18,16,25,20), c("a", "b", "c", "ed", "d", "f", "e"))

plot of chunk unnamed-chunk-1





####COVARIANZA

M <- lm(mammals ~ productivity + region + productivity:region)
M <- lm(mammals ~ productivity * region)
summary.aov(M)
                    Df Sum Sq Mean Sq F value Pr(>F)    
productivity         1   1359    1359   455.8 <2e-16 ***
region               6   1097     183    61.3 <2e-16 ***
productivity:region  6     32       5     1.8   0.12    
Residuals           40    119       3                   
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1 
summary(M)

Call:
lm(formula = mammals ~ productivity * region)

Residuals:
   Min     1Q Median     3Q    Max 
-5.964 -0.664  0.125  0.927  3.143 

Coefficients:
                     Estimate Std. Error t value Pr(>|t|)    
(Intercept)             3.464      1.345    2.57   0.0138 *  
productivity           -0.298      0.266   -1.12   0.2706    
regionb                 6.250      2.485    2.52   0.0160 *  
regionc                17.476      3.402    5.14  7.6e-06 ***
regiond                22.964      4.638    4.95  1.4e-05 ***
regione                19.833      4.136    4.80  2.3e-05 ***
regionf                27.429      3.402    8.06  6.5e-10 ***
regiong                24.355      7.785    3.13   0.0033 ** 
productivity:regionb   -0.464      0.377   -1.23   0.2251    
productivity:regionc   -0.893      0.377   -2.37   0.0227 *  
productivity:regiond   -0.524      0.377   -1.39   0.1722    
productivity:regione   -0.714      0.377   -1.90   0.0652 .  
productivity:regionf   -0.149      0.298   -0.50   0.6201    
productivity:regiong   -0.188      0.491   -0.38   0.7038    
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1 

Residual standard error: 1.73 on 40 degrees of freedom
Multiple R-squared: 0.954,  Adjusted R-squared: 0.939 
F-statistic: 64.2 on 13 and 40 DF,  p-value: <2e-16 

plot(productivity, mammals, type = "n")
LEV <- levels(region)
for(i in 1:7){
  points(productivity[region==LEV[i]], mammals[region==LEV[i]], col =i, pch = 19)}


PA <- c(0,1,0,0,0,0,0,0,0,0,0,0,0,0)
PB <- c(0,1,0,0,0,0,0,0,1,0,0,0,0,0)
PC <- c(0,1,0,0,0,0,0,0,0,1,0,0,0,0)
PD <- c(0,1,0,0,0,0,0,0,0,0,1,0,0,0)
PE <- c(0,1,0,0,0,0,0,0,0,0,0,1,0,0)
PF <- c(0,1,0,0,0,0,0,0,0,0,0,0,1,0)
PG <- c(0,1,0,0,0,0,0,0,0,0,0,0,0,1)

LEV2 <- character()
CONT <- matrix(NA, 21, 5)
MATRIZ <- rbind(PA,PB,PC,PD,PE,PF,PG)
k <- 2
CC <- 0
for(i in (1:6)){
  for(j in k:7){
    CC <- CC +1
    CONT[CC, ] <- unlist(estimable(M, MATRIZ[i,]- MATRIZ[j,])[,1:5])
    LEV2 <- c(LEV2, paste(LEV[i], LEV[j]))
  }
  k<-k+1

}

rownames(CONT) <- LEV2
colnames(CONT) <- c("Estimate", "se", "tval", "df", "Pval")

abline
function (a = NULL, b = NULL, h = NULL, v = NULL, reg = NULL, 
    coef = NULL, untf = FALSE, ...) 
{
    int_abline <- function(a, b, h, v, untf, col = par("col"), 
        lty = par("lty"), lwd = par("lwd"), ...) .Internal(abline(a, 
        b, h, v, untf, col, lty, lwd, ...))
    if (!is.null(reg)) {
        if (!is.null(a)) 
            warning("'a' is overridden by 'reg'")
        a <- reg
    }
    if (is.object(a) || is.list(a)) {
        p <- length(coefa <- as.vector(coef(a)))
        if (p > 2) 
            warning(gettextf("only using the first two of %d regression coefficients", 
                p), domain = NA)
        islm <- inherits(a, "lm")
        noInt <- if (islm) 
            !as.logical(attr(stats::terms(a), "intercept"))
        else p == 1
        if (noInt) {
            a <- 0
            b <- coefa[1L]
        }
        else {
            a <- coefa[1L]
            b <- if (p >= 2) 
                coefa[2L]
            else 0
        }
    }
    if (!is.null(coef)) {
        if (!is.null(a)) 
            warning("'a' and 'b' are overridden by 'coef'")
        a <- coef[1L]
        b <- coef[2L]
    }
    int_abline(a = a, b = b, h = h, v = v, untf = untf, ...)
    invisible()
}
<bytecode: 0x100f68f90>
<environment: namespace:graphics>

summary(M)

Call:
lm(formula = mammals ~ productivity * region)

Residuals:
   Min     1Q Median     3Q    Max 
-5.964 -0.664  0.125  0.927  3.143 

Coefficients:
                     Estimate Std. Error t value Pr(>|t|)    
(Intercept)             3.464      1.345    2.57   0.0138 *  
productivity           -0.298      0.266   -1.12   0.2706    
regionb                 6.250      2.485    2.52   0.0160 *  
regionc                17.476      3.402    5.14  7.6e-06 ***
regiond                22.964      4.638    4.95  1.4e-05 ***
regione                19.833      4.136    4.80  2.3e-05 ***
regionf                27.429      3.402    8.06  6.5e-10 ***
regiong                24.355      7.785    3.13   0.0033 ** 
productivity:regionb   -0.464      0.377   -1.23   0.2251    
productivity:regionc   -0.893      0.377   -2.37   0.0227 *  
productivity:regiond   -0.524      0.377   -1.39   0.1722    
productivity:regione   -0.714      0.377   -1.90   0.0652 .  
productivity:regionf   -0.149      0.298   -0.50   0.6201    
productivity:regiong   -0.188      0.491   -0.38   0.7038    
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1 

Residual standard error: 1.73 on 40 degrees of freedom
Multiple R-squared: 0.954,  Adjusted R-squared: 0.939 
F-statistic: 64.2 on 13 and 40 DF,  p-value: <2e-16 
abline(3.4643, -0.2976)
abline(3.4643+6.2500, -0.2976 + -0.4643, col = "red")
abline(3.4643+17.4762, -0.2976 + -0.8929, col = "green")
abline(3.4643+22.9643, -0.2976 + -0.5238, col = "purple")
abline(3.4643+19.8333, -0.2976 + -0.7143, col = "cyan")
abline(3.4643+27.4286, -0.2976 + -0.1488, col = "pink")
abline(3.4643+24.3548, -0.2976 + -0.1881, col = "yellow")

plot of chunk unnamed-chunk-1



plot(productivity, mammals, type = "n")
LEV <- levels(region)
for(i in 1:7){
  points(productivity[region==LEV[i]], mammals[region==LEV[i]], col =i, pch = 19)}

K <- 0
for(i in LEV){
  K <- K + 1
SEQ <- seq(min(productivity[region== i]), max(productivity[region== i]), length = 30)
PRED <- predict(M, data.frame(region = i, productivity = SEQ))
lines(SEQ, PRED, col = K, lwd =3)}

plot of chunk unnamed-chunk-1



par(bg="darkgrey", fg = "red")
plot(productivity, mammals, type = "n")
LEV <- levels(region)
for(i in 1:7){
  points(productivity[region==LEV[i]], mammals[region==LEV[i]], col =i, pch = 19)}

K <- 0
for(i in LEV){
  K <- K + 1
  SEQ <- seq(min(productivity[region== i]), max(productivity[region == i]), length = 30)
  PRED <- predict(M, data.frame(region = i, productivity = SEQ), se = TRUE)
  lines(SEQ, PRED$fit, col = K, lwd =3)
  lines(SEQ, PRED$fit - PRED$se.fit*2, col = K, lwd =1, lty= 2)
  lines(SEQ, PRED$fit + PRED$se.fit*2, col = K, lwd =1, lty= 2)
}

plot of chunk unnamed-chunk-1



###SIMPLIFICAR
M2 <- lm(mammals ~ productivity + region)
anova(M2)
Analysis of Variance Table

Response: mammals
             Df Sum Sq Mean Sq F value Pr(>F)    
productivity  1   1359    1359   412.8 <2e-16 ***
region        6   1097     183    55.6 <2e-16 ***
Residuals    46    151       3                   
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1 
anova(M2, M)
Analysis of Variance Table

Model 1: mammals ~ productivity + region
Model 2: mammals ~ productivity * region
  Res.Df RSS Df Sum of Sq   F Pr(>F)
1     46 151                        
2     40 119  6      32.2 1.8   0.12
AIC(M2, M)
   df   AIC
M2  9 226.9
M  15 226.0


par(bg="darkgrey", fg = "red")
plot(productivity, mammals, type = "n")
LEV <- levels(region)
for(i in 1:7){
  points(productivity[region==LEV[i]], mammals[region==LEV[i]], col =i, pch = 19)}

K <- 0
for(i in LEV){
  K <- K + 1
  SEQ <- seq(min(productivity[region== i]), max(productivity[region == i]), length = 30)
  PRED <- predict(M2, data.frame(region = i, productivity = SEQ), se = TRUE)
  lines(SEQ, PRED$fit, col = K, lwd =3)
  lines(SEQ, PRED$fit - PRED$se.fit*2, col = K, lwd =1, lty= 2)
  lines(SEQ, PRED$fit + PRED$se.fit*2, col = K, lwd =1, lty= 2)
}

plot of chunk unnamed-chunk-1





#####
DATOS$TV <- factor(rep(c("CD", "PN"), each =27))
DATOS$CONTI <- factor(rep(c("AM", "SEA","SEA", "AM"), c(13, 14, 14, 13)))
attach(DATOS)
The following object(s) are masked from 'DATOS (position 4)':

    mammals, productivity, region

M <- lm(productivity ~ TV*CONTI)
summary(M)

Call:
lm(formula = productivity ~ TV * CONTI)

Residuals:
   Min     1Q Median     3Q    Max 
-5.462 -2.022  0.033  2.088  8.538 

Coefficients:
              Estimate Std. Error t value Pr(>|t|)    
(Intercept)      5.077      0.829    6.12  1.4e-07 ***
TVPN            16.385      1.172   13.98  < 2e-16 ***
CONTISEA         6.637      1.151    5.77  5.1e-07 ***
TVPN:CONTISEA  -12.242      1.628   -7.52  9.3e-10 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1 

Residual standard error: 2.99 on 50 degrees of freedom
Multiple R-squared: 0.807,  Adjusted R-squared: 0.795 
F-statistic: 69.7 on 3 and 50 DF,  p-value: <2e-16 
plot(M)

plot of chunk unnamed-chunk-1 plot of chunk unnamed-chunk-1 plot of chunk unnamed-chunk-1 plot of chunk unnamed-chunk-1