Clase 3.R

Roger Guevara — May 22, 2014, 6:25 AM

#Ejercicio sobre distribución de eventos en el tiempo/espacio. Germinación de semillas
#K es el número de semillas que germinaron en lotes de siete semillas
K <- 0:7

#FQ es la frecuencia con que se obervó K
FQ <- c(3,14,20,24,19,11,8,1)

#Para visualizarlo en formato de tabla
data.frame(K,FQ)
  K FQ
1 0  3
2 1 14
3 2 20
4 3 24
5 4 19
6 5 11
7 6  8
8 7  1

#FQK es el producto de K por FQ, y la suma de FQK nos da el total de semillas que germinaron
FQK <- FQ*K
TOTAL_GERM <- sum(FQK)

#La suma de FQ nos da el total de unidades experimentales
MT <- sum(FQ)

#El promeio es el total de semillas que germinaron entre el número total de unidades experimentales
MEDIA <- TOTAL_GERM/MT

#La probabilidad de que germine una semilla es la MEDIAS entre el número de semillas en cada unidad experimental, que son siete.
P <- MEDIA/7

#La probabilidad de que no germine una semilla es 1- P, es decir el complemento
Q <- 1-P

#Con la distribución binomil ponemos a prueba la Ho de que la germinación de cada semilla en cada lote es un evento independiente. Bajo esta Ho calculamos la propbabilidad de obervar 0, 1, 2 ... 7 semillas germinadas en un lote. La probabilidad de observar , por jemplo 3 semillas germinadas es 35 * P^3 * Q^4.   P^3 es la probabilidad de observar que tres semillas germinen independientemente, es decir P*P*P, y Q^4 es su contraparte, la probabilidad de que cuatro semillas no germinen (Q*Q*Q*Q). Treinta y cinco es el número de combinaciones en las que podemos observar en un lote de siete semillas tres germinadas y cuatro  o germinadas, pueden germinar la semillas 1,2 y 3; o las semillas 1,2 y 4; o las semillas 1,2 y 5, etc. En total 35 formas posibles. Esto se calcula con el tríangulo de Pascal. Se forman dos diagonales divergentes con el número 1, y luego empezando en el segundo reglon se suman los valores del rengón  anterior que flanquean cada una de las posciones en el renglón dos, tres, etc.
#            1 1
#          1  2  1
#        1  3  3   1
#      1   4  6  4   1
#    1   5  10  10  5  1
#  1   6  15  20  15  6  1
#1  7   21  35   35 21  7  1


P^K
[1] 1.000000 0.445714 0.198661 0.088546 0.039466 0.017591 0.007840 0.003495
Q^(7:0)
[1] 0.01607 0.02900 0.05232 0.09439 0.17029 0.30723 0.55429 1.00000
COEF <- c(1,7,21,35,35,21,7,1)

PROBABILIDADES <- P^K * Q^(7:0) * COEF

#La suma de las probabilidades da uno exacto en cualquier caso calculado con la disribución binomial
sum(PROBABILIDADES)
[1] 1

#Las probabilidades multiplicadas por el número total de unidades experimentales nos da la frecuecnia esperada por azar. 
FQE <- PROBABILIDADES * MT

#desplegamos una tabla con los distintos valores calculados 
data.frame(K,FQ, FQK, COEF, PROBABILIDADES, FQE)
  K FQ FQK COEF PROBABILIDADES     FQE
1 0  3   0    1       0.016074  1.6074
2 1 14  14    7       0.090481  9.0481
3 2 20  40   21       0.218273 21.8273
4 3 24  72   35       0.292531 29.2531
5 4 19  76   35       0.235231 23.5231
6 5 11  55   21       0.113493 11.3493
7 6  8  48    7       0.030421  3.0421
8 7  1   7    1       0.003495  0.3495

#alculamos el valor de ji-cuadrada 
X2 <- sum((FQ - FQE)^2/FQE)

# Y calculamos la probabilidad de observar dicho valor de ji-cuadrada
pchisq(X2, 7, lower = FALSE)
[1] 0.0337


?chisq.test


###
ppois(1, lambda =3.12)
[1] 0.1819
rpois(1, lambda = 3)
[1] 2
ppois(0, lambda =3.12, lower = FALSE)
[1] 0.9558
dpois(1, lambda =3.12)
[1] 0.1378
PROBABILIDADES <- dpois(0:6, lambda =3.12)
sum(PROBABILIDADES)
[1] 0.9601
PROBABILIDADES[8] <- 1- sum(PROBABILIDADES)
sum(PROBABILIDADES)
[1] 1


FQE <- PROBABILIDADES * 100

data.frame(K,FQ, FQK, PROBABILIDADES, FQE)
  K FQ FQK PROBABILIDADES    FQE
1 0  3   0        0.04416  4.416
2 1 14  14        0.13777 13.777
3 2 20  40        0.21492 21.492
4 3 24  72        0.22352 22.352
5 4 19  76        0.17434 17.434
6 5 11  55        0.10879 10.879
7 6  8  48        0.05657  5.657
8 7  1   7        0.03993  3.993

X2 <- sum((FQ - FQE)^2/FQE)
X2
[1] 4.038
pchisq(X2, 7, lower = FALSE)
[1] 0.7754



#Otro Ejercicio
K <- 0:6
FQ <- c(4,9,25,25,26,9,2)

data.frame(K,FQ)
  K FQ
1 0  4
2 1  9
3 2 25
4 3 25
5 4 26
6 5  9
7 6  2

FQK <- FQ*K

MT <- sum(FQ)

TOTAL_DISP <- sum(FQK)

MEDIA <- TOTAL_DISP/MT

P <- MEDIA/6
Q <- 1-P

P^K
[1] 1.00000 0.49167 0.24174 0.11885 0.05844 0.02873 0.01413
Q^(6:0)
[1] 0.01725 0.03394 0.06677 0.13135 0.25840 0.50833 1.00000
Q^K[7:1]
[1] 0.01725 0.03394 0.06677 0.13135 0.25840 0.50833 1.00000
factorial(6)/(factorial(3)*factorial(3))
[1] 20
COEF <- c(1,6,15,20,15,6,1)

PROBABILIDADES <- P^K * Q^(K[7:1]) * COEF
sum(PROBABILIDADES)
[1] 1

FQE <- PROBABILIDADES * 100

data.frame(K,FQ, FQK, COEF, PROBABILIDADES, FQE)
  K FQ FQK COEF PROBABILIDADES    FQE
1 0  4   0    1        0.01725  1.725
2 1  9   9    6        0.10013 10.013
3 2 25  50   15        0.24212 24.212
4 3 25  75   20        0.31224 31.224
5 4 26 104   15        0.22650 22.650
6 5  9  45    6        0.08763  8.763
7 6  2  12    1        0.01413  1.413

X2 <- sum((FQ - FQE)^2/FQE)
X2
[1] 5.113
pchisq(X2, 6, lower = FALSE)
[1] 0.5293




###

PROBABILIDADES <- dpois(0:5, lambda = 2.95)
sum(PROBABILIDADES)
[1] 0.921
PROBABILIDADES[7] <- 1- sum(PROBABILIDADES)
sum(PROBABILIDADES)
[1] 1


FQE <- PROBABILIDADES * 100

data.frame(K,FQ, FQK, PROBABILIDADES, FQE)
  K FQ FQK PROBABILIDADES    FQE
1 0  4   0        0.05234  5.234
2 1  9   9        0.15440 15.440
3 2 25  50        0.22774 22.774
4 3 25  75        0.22395 22.395
5 4 26 104        0.16516 16.516
6 5  9  45        0.09745  9.745
7 6  2  12        0.07896  7.896

X2 <- sum((FQ - FQE)^2/FQE)
X2
[1] 13.4
pchisq(X2, 6, lower = FALSE)
[1] 0.03706

OE <- rbind(FQ, FQE, FQ)
colnames(OE) <- 0:6
barplot(OE, col = c("darkgreen", "darkred"), beside = TRUE, density = 16, angle=45, names.arg = LETTERS[20:26])
legend(16, 27, c("Obs", "Esp"), fill =c("darkgreen", "darkred"), bty = "n", density = 16, angle=45,cex=2)

plot of chunk unnamed-chunk-1





#CORRELACIÓN
rm(list=ls()) #elimina TODOS los objetos del ambiente de trabajo

DATOS <- read.table("~/desktop/cursos R/2014/productivity.csv", header = TRUE, sep =",") 
attach(DATOS)
names(DATOS)
[1] "productivity" "mammals"      "region"      

plot(productivity, mammals)

plot of chunk unnamed-chunk-1


length(table(region))
[1] 7

PROD_MEDIA <- mean (productivity)
MAM_MEDIA <- mean (mammals)

sum((productivity-PROD_MEDIA) *
(mammals-MAM_MEDIA))/53
[1] 33.47

sd(productivity)*sd(mammals)
[1] 46.36

cor(productivity[region=="g"], mammals[region=="g"])
[1] -0.7773

cor(productivity[region=="f"], mammals[region=="f"])
[1] -0.6318
cor(productivity[region=="e"], mammals[region=="e"])
[1] -0.8475
cor(productivity[region=="d"], mammals[region=="d"])
[1] -0.7463
cor(productivity[region=="c"], mammals[region=="c"])
[1] -0.9124
cor(productivity[region=="b"], mammals[region=="b"])
[1] -0.8468
cor(productivity[region=="a"], mammals[region=="a"])
[1] -0.6474

plot(productivity, mammals, type = "n")

LEV <- levels(region)
for(i in 1:7){
points(productivity[region==LEV[i]], mammals[region==LEV[i]], col =i, pch = 19)}

plot of chunk unnamed-chunk-1



CORCOEF <- numeric()
for(i in 1:1000){
MALEAT <- sample(1:54, replace = TRUE)
CORCOEF[i] <- cor(productivity[MALEAT], mammals[MALEAT])
}

hist(CORCOEF)

plot of chunk unnamed-chunk-1

quantile(CORCOEF, c(0.025, 0.5, 0.975))
  2.5%    50%  97.5% 
0.6241 0.7256 0.8067 
0.7219081/(sd(CORCOEF)*2)
[1] 7.577
pt(15.72646, 53, lower=FALSE) * 2
[1] 1.305e-21
cor.test(productivity, mammals)

    Pearson's product-moment correlation

data:  productivity and mammals 
t = 7.523, df = 52, p-value = 7.268e-10
alternative hypothesis: true correlation is not equal to 0 
95 percent confidence interval:
 0.5630 0.8294 
sample estimates:
   cor 
0.7219 


#
for(j in LEV){
  CORCOEF <- numeric()
  for(i in 1:100){
    PROD_REG_X <- productivity[region== j]
    MAM_REG_X <- mammals[region== j]
    MALEAT <- sample(1:length(PROD_REG_X), replace = TRUE)
    CORCOEF[i] <- cor(PROD_REG_X[MALEAT], MAM_REG_X[MALEAT])
  }
  print(j)
  print(quantile(CORCOEF, c(0.025, 0.5, 0.975)))
  EE <- sd(CORCOEF)
  R <- cor(PROD_REG_X, MAM_REG_X)
  Tval <- R/EE
  Pval <- pt(abs(Tval), length(PROD_REG_X)-2, lower=FALSE)*2
  print(rbind(R, EE, Tval, Pval, "", "", "", ""))
}
[1] "a"
   2.5%     50%   97.5% 
-0.9217 -0.6709 -0.2363 
     [,1]                
R    "-0.647442474054588"
EE   "0.185247116818675" 
Tval "-3.4950205173143"  
Pval "0.0129042624402205"
     ""                  
     ""                  
     ""                  
     ""                  
[1] "b"
   2.5%     50%   97.5% 
-0.9717 -0.8862 -0.5758 
     [,1]                 
R    "-0.846809798439941" 
EE   "0.164604669177574"  
Tval "-5.14450654814908"  
Pval "0.00212663262140647"
     ""                   
     ""                   
     ""                   
     ""                   
[1] "c"
   2.5%     50%   97.5% 
-0.9841 -0.9291 -0.8171 
     [,1]                  
R    "-0.912414835752265"  
EE   "0.0584140916561054"  
Tval "-15.6197727275093"   
Pval "4.36064073690113e-06"
     ""                    
     ""                    
     ""                    
     ""                    
[1] "d"
    2.5%      50%    97.5% 
-0.94535 -0.75984 -0.08255 
     [,1]                
R    "-0.746349487195883"
EE   "0.231362695525531" 
Tval "-3.22588516485158" 
Pval "0.0180033942242703"
     ""                  
     ""                  
     ""                  
     ""                  
[1] "e"
   2.5%     50%   97.5% 
-0.9938 -0.8736 -0.5323 
     [,1]                 
R    "-0.84750354141385"  
EE   "0.142265991711643"  
Tval "-5.95717592951972"  
Pval "0.00100144304616023"
     ""                   
     ""                   
     ""                   
     ""                   
[1] "f"
   2.5%     50%   97.5% 
-0.9821 -0.6554  0.2063 
     [,1]                
R    "-0.63181561626907" 
EE   "0.320262242610973" 
Tval "-1.9728070693508"  
Pval "0.0959760501278811"
     ""                  
     ""                  
     ""                  
     ""                  
[1] "g"
   2.5%     50%   97.5% 
-0.9955 -0.8124 -0.2500 
     [,1]                
R    "-0.777290926660661"
EE   "0.219758176528262" 
Tval "-3.53702846893025" 
Pval "0.0240777013764844"
     ""                  
     ""                  
     ""                  
     ""                  



CORCOEF <- numeric()
for(j in LEV){
  for(i in 1:100){
    PROD_REG_X <- productivity[region== j]
    MAM_REG_X <- mammals[region== j]
    MALATE <- sample(1:length(PROD_REG_X), replace=TRUE)
    CORCOEF[i] <- cor.test(PROD_REG_X[MALATE], MAM_REG_X[MALATE])$estimate
}
  }
Warning: NaNs produced
Warning: NaNs produced
Warning: NaNs produced
Warning: NaNs produced
Warning: NaNs produced
Warning: NaNs produced


#

###REGERSIÓN Y ~ X
M <- lm(mammals ~ productivity)
summary(M)

Call:
lm(formula = mammals ~ productivity)

Residuals:
   Min     1Q Median     3Q    Max 
-8.669 -3.686 -0.275  3.431 12.798 

Coefficients:
             Estimate Std. Error t value Pr(>|t|)    
(Intercept)    -0.056      1.531   -0.04     0.97    
productivity    0.766      0.102    7.52  7.3e-10 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1 

Residual standard error: 4.9 on 52 degrees of freedom
Multiple R-squared: 0.521,  Adjusted R-squared: 0.512 
F-statistic: 56.6 on 1 and 52 DF,  p-value: 7.27e-10 
AIC(M)
[1] 328.9

plot(mammals ~ productivity)

plot of chunk unnamed-chunk-1

plot(productivity, mammals)
abline(-0.05604, 0.76611, col ="red", lty = 2)
abline(M, col ="blue", lty = 1)


predict(M, data.frame(productivity = 0.90), se=TRUE)
$fit
     1 
0.6335 

$se.fit
[1] 1.449

$df
[1] 52

$residual.scale
[1] 4.9

SEQ <- seq(0, 30, length=50)
PRED <- predict(M, data.frame(productivity = SEQ), se=TRUE)

QT <-qt(0.975, 52)

LIIC95 <- PRED$fit - PRED$se.fit * QT
LSIC95 <- PRED$fit + PRED$se.fit * QT

lines(SEQ, LIIC95, lty = 2, col = "darkred")
lines(SEQ, LSIC95, lty = 2, col = "darkred")

plot of chunk unnamed-chunk-1



par(mfrow=c(2,2))
plot(M)

plot of chunk unnamed-chunk-1