Clase 11-May-2023

library(readxl)
databan <- read_excel('Clase 11.05.2023/Data_ModeloAF_2022.xlsx',
                      sheet = "alo2")
head(databan)

## # A tibble: 6 × 13
##     A10   A20   A30   A40   A50   A60   A70 A_1_3LCL A1_2LCL  ALCL  Kumar     B
##   <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>    <dbl>   <dbl> <dbl>  <dbl> <dbl>
## 1  207.  140.  103.  84.6  65.5  51.3  44.3     65.5    44.3  24.4 0.0747 0.167
## 2  250.  191.  154. 127.  109.   92.0  76.5     92.0    62.4  55.9 0.118  0.362
## 3  286.  207.  168. 134.  112.   92.0  81.5    112.     64.6  58.0 0.274  0.522
## 4  191.  151.  134. 103.   86.7  71.6  62.4     86.7    62.4  62.4 0.235  0.530
## 5  277.  188.  154. 131.  103.   81.5  68.3    103.     60.2  42.1 0.426  0.670
## 6  413.  219.  176. 151.  112.   97.5  79.0    112.     73.5  71.6 1.26   1.54 
## # ℹ 1 more variable: DM <dbl>

require(stats)
pc <- prcomp(x = databan,
             center = TRUE, 
             scale. = TRUE)
print(pc)

## Standard deviations (1, .., p=13):
##  [1] 3.53709429 0.51086070 0.31610830 0.18581100 0.17465942 0.13425889
##  [7] 0.11619086 0.10458127 0.08757657 0.07332399 0.06654957 0.04286862
## [13] 0.03540655
## 
## Rotation (n x k) = (13 x 13):
##                PC1         PC2         PC3         PC4         PC5         PC6
## A10      0.2642267 -0.63375048 -0.34530470 -0.32850723 -0.28944164  0.42516216
## A20      0.2755683 -0.39130416 -0.03333290  0.06618973  0.29850562 -0.40776312
## A30      0.2785656 -0.25022949 -0.03246763  0.39230612  0.29518435 -0.11223955
## A40      0.2813391 -0.11934981  0.01169124  0.22276525  0.01029357 -0.10900723
## A50      0.2818247 -0.04763707  0.05287199  0.25755903 -0.03086914 -0.08786534
## A60      0.2806135  0.07949373  0.22522288  0.18019393 -0.30409794  0.08249630
## A70      0.2805758  0.08004135  0.23943366  0.16431181 -0.28891962  0.12337283
## A_1_3LCL 0.2782607  0.04619014  0.38439008 -0.31067432 -0.46632161 -0.28154887
## A1_2LCL  0.2783192  0.12981410  0.41537424 -0.13453452  0.34261475  0.17817216
## ALCL     0.2792227  0.09548626  0.14840484 -0.52443595  0.47241764  0.21563166
## Kumar    0.2753299  0.29762498 -0.42986651 -0.28072798 -0.06441458 -0.47424279
## B        0.2751223  0.34607099 -0.42376437 -0.02720779 -0.01044230 -0.00127710
## DM       0.2761327  0.34196263 -0.24685327  0.29861178  0.02641507  0.46510140
##                    PC7         PC8         PC9        PC10         PC11
## A10      -0.0467366878  0.05913596 -0.13435564  0.08646792 -0.033963004
## A20      -0.0003311553  0.46599391  0.43729545  0.01648874 -0.005556239
## A30       0.2016813121 -0.51337148  0.02569906  0.35111015 -0.216357039
## A40       0.1564969506 -0.18671744 -0.45849705 -0.41228775  0.317399879
## A50      -0.2041917820  0.15253134 -0.22703071 -0.44719248  0.030500303
## A60      -0.3486823010 -0.24319674  0.05311498  0.42665266  0.195566684
## A70      -0.3498031950  0.07177027  0.41748762 -0.16552372  0.072632077
## A_1_3LCL  0.5653251441 -0.02956493  0.04995293 -0.03075034 -0.215353444
## A1_2LCL  -0.0515932370  0.43115589 -0.46021797  0.35879137 -0.047405108
## ALCL     -0.0745307063 -0.39538334  0.26131897 -0.29286223  0.101200927
## Kumar    -0.3996001034 -0.09002008 -0.20111863  0.05881064 -0.333558921
## B         0.3238117183  0.14175943  0.11461831  0.21779247  0.623948860
## DM        0.2338151849  0.14859567  0.12252210 -0.15245594 -0.497363915
##                 PC12        PC13
## A10       0.05178697 -0.04252076
## A20      -0.22537450  0.22494550
## A30       0.27357459 -0.23511452
## A40       0.05946857  0.54865262
## A50      -0.22961414 -0.68583025
## A60      -0.55432032  0.14616907
## A70       0.62642134  0.07044133
## A_1_3LCL -0.04338115 -0.08527045
## A1_2LCL   0.18543802  0.01405947
## ALCL     -0.13639832 -0.03741312
## Kumar     0.10505481  0.09473437
## B         0.09315148 -0.20150773
## DM       -0.20287954  0.19014216

summary(pc)

## Importance of components:
##                           PC1     PC2     PC3     PC4     PC5     PC6     PC7
## Standard deviation     3.5371 0.51086 0.31611 0.18581 0.17466 0.13426 0.11619
## Proportion of Variance 0.9624 0.02008 0.00769 0.00266 0.00235 0.00139 0.00104
## Cumulative Proportion  0.9624 0.98246 0.99015 0.99280 0.99515 0.99654 0.99758
##                            PC8     PC9    PC10    PC11    PC12    PC13
## Standard deviation     0.10458 0.08758 0.07332 0.06655 0.04287 0.03541
## Proportion of Variance 0.00084 0.00059 0.00041 0.00034 0.00014 0.00010
## Cumulative Proportion  0.99842 0.99901 0.99942 0.99976 0.99990 1.00000

plot(pc)

screeplot(pc, type = "line", main = "Scree plot")


library(devtools)

## Warning: package 'devtools' was built under R version 4.2.3

## Loading required package: usethis

## Warning: package 'usethis' was built under R version 4.2.3

install_github("vqv/ggbiplot")

## WARNING: Rtools is required to build R packages, but is not currently installed.
## 
## Please download and install Rtools 4.2 from https://cran.r-project.org/bin/windows/Rtools/ or https://www.r-project.org/nosvn/winutf8/ucrt3/.

## Skipping install of 'ggbiplot' from a github remote, the SHA1 (7325e880) has not changed since last install.
##   Use `force = TRUE` to force installation

library(ggbiplot)

## Loading required package: ggplot2

## Warning: package 'ggplot2' was built under R version 4.2.3

## Loading required package: plyr

## Loading required package: scales

## Loading required package: grid

ggbiplot(pc)

library(ggbiplot)
data(databan)

## Warning in data(databan): data set 'databan' not found

ban.pca <- prcomp(databan, scale. = TRUE)
ggbiplot(ban.pca, obs.scale = 1, var.scale = 1,
         ellipse = TRUE, circle = TRUE) +
  scale_color_discrete(name = '') +
  theme(legend.direction = 'horizontal', legend.position = 'top')

biplot2 = ggbiplot(pcobj = ban.pca,alpha = 0.2,varname.adjust = T,
                  choices = c(1,2),
                  obs.scale = 1, var.scale = 1,  # Scaling of axis
                  #labels = row.names(databan),     # Add labels as rownames
                  labels.size = 3,
                  varname.size = 2,
                  varname.abbrev = TRUE,  # Abbreviate variable names (TRUE)
                  var.axes = T,      # Remove variable vectors (TRUE)
                  circle = T,        # Add unit variance circle (TRUE)
                  ellipse = TRUE)+ # Adding ellipses
  scale_color_discrete(name = '')+
  theme(legend.direction = 'horizontal', legend.position = 'top')
print(biplot2)

biplot3 = biplot2 + theme(legend.position = "bottom")
print(biplot3)

theme1 = biplot3 + theme_bw()
theme2 = biplot3 + theme_classic()
theme3 = biplot3 + theme_dark()
theme4 = biplot3 + theme_gray()
theme5 = biplot3 + theme_minimal()
theme6 = biplot3 + theme_void()

Leaf area prediction models from growth measurements in Andean Blueberry in the nursery. (Agraz) Artículo de banano

library(readxl)
databan <- read_excel('Clase 11.05.2023/Data_ModeloAF_2022.xlsx',
                      sheet = "alo2")
head(databan)

## # A tibble: 6 × 13
##     A10   A20   A30   A40   A50   A60   A70 A_1_3LCL A1_2LCL  ALCL  Kumar     B
##   <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>    <dbl>   <dbl> <dbl>  <dbl> <dbl>
## 1  207.  140.  103.  84.6  65.5  51.3  44.3     65.5    44.3  24.4 0.0747 0.167
## 2  250.  191.  154. 127.  109.   92.0  76.5     92.0    62.4  55.9 0.118  0.362
## 3  286.  207.  168. 134.  112.   92.0  81.5    112.     64.6  58.0 0.274  0.522
## 4  191.  151.  134. 103.   86.7  71.6  62.4     86.7    62.4  62.4 0.235  0.530
## 5  277.  188.  154. 131.  103.   81.5  68.3    103.     60.2  42.1 0.426  0.670
## 6  413.  219.  176. 151.  112.   97.5  79.0    112.     73.5  71.6 1.26   1.54 
## # ℹ 1 more variable: DM <dbl>

dim(databan)

## [1] 34 13

library(ggcorrplot)

## Warning: package 'ggcorrplot' was built under R version 4.2.3

R=cor(databan)
ggcorrplot(R,lab=T)

require(stats)
pc <- prcomp(x = databan,
             center = TRUE, 
             scale. = TRUE)
print(pc)

## Standard deviations (1, .., p=13):
##  [1] 3.53709429 0.51086070 0.31610830 0.18581100 0.17465942 0.13425889
##  [7] 0.11619086 0.10458127 0.08757657 0.07332399 0.06654957 0.04286862
## [13] 0.03540655
## 
## Rotation (n x k) = (13 x 13):
##                PC1         PC2         PC3         PC4         PC5         PC6
## A10      0.2642267 -0.63375048 -0.34530470 -0.32850723 -0.28944164  0.42516216
## A20      0.2755683 -0.39130416 -0.03333290  0.06618973  0.29850562 -0.40776312
## A30      0.2785656 -0.25022949 -0.03246763  0.39230612  0.29518435 -0.11223955
## A40      0.2813391 -0.11934981  0.01169124  0.22276525  0.01029357 -0.10900723
## A50      0.2818247 -0.04763707  0.05287199  0.25755903 -0.03086914 -0.08786534
## A60      0.2806135  0.07949373  0.22522288  0.18019393 -0.30409794  0.08249630
## A70      0.2805758  0.08004135  0.23943366  0.16431181 -0.28891962  0.12337283
## A_1_3LCL 0.2782607  0.04619014  0.38439008 -0.31067432 -0.46632161 -0.28154887
## A1_2LCL  0.2783192  0.12981410  0.41537424 -0.13453452  0.34261475  0.17817216
## ALCL     0.2792227  0.09548626  0.14840484 -0.52443595  0.47241764  0.21563166
## Kumar    0.2753299  0.29762498 -0.42986651 -0.28072798 -0.06441458 -0.47424279
## B        0.2751223  0.34607099 -0.42376437 -0.02720779 -0.01044230 -0.00127710
## DM       0.2761327  0.34196263 -0.24685327  0.29861178  0.02641507  0.46510140
##                    PC7         PC8         PC9        PC10         PC11
## A10      -0.0467366878  0.05913596 -0.13435564  0.08646792 -0.033963004
## A20      -0.0003311553  0.46599391  0.43729545  0.01648874 -0.005556239
## A30       0.2016813121 -0.51337148  0.02569906  0.35111015 -0.216357039
## A40       0.1564969506 -0.18671744 -0.45849705 -0.41228775  0.317399879
## A50      -0.2041917820  0.15253134 -0.22703071 -0.44719248  0.030500303
## A60      -0.3486823010 -0.24319674  0.05311498  0.42665266  0.195566684
## A70      -0.3498031950  0.07177027  0.41748762 -0.16552372  0.072632077
## A_1_3LCL  0.5653251441 -0.02956493  0.04995293 -0.03075034 -0.215353444
## A1_2LCL  -0.0515932370  0.43115589 -0.46021797  0.35879137 -0.047405108
## ALCL     -0.0745307063 -0.39538334  0.26131897 -0.29286223  0.101200927
## Kumar    -0.3996001034 -0.09002008 -0.20111863  0.05881064 -0.333558921
## B         0.3238117183  0.14175943  0.11461831  0.21779247  0.623948860
## DM        0.2338151849  0.14859567  0.12252210 -0.15245594 -0.497363915
##                 PC12        PC13
## A10       0.05178697 -0.04252076
## A20      -0.22537450  0.22494550
## A30       0.27357459 -0.23511452
## A40       0.05946857  0.54865262
## A50      -0.22961414 -0.68583025
## A60      -0.55432032  0.14616907
## A70       0.62642134  0.07044133
## A_1_3LCL -0.04338115 -0.08527045
## A1_2LCL   0.18543802  0.01405947
## ALCL     -0.13639832 -0.03741312
## Kumar     0.10505481  0.09473437
## B         0.09315148 -0.20150773
## DM       -0.20287954  0.19014216

summary(pc)

## Importance of components:
##                           PC1     PC2     PC3     PC4     PC5     PC6     PC7
## Standard deviation     3.5371 0.51086 0.31611 0.18581 0.17466 0.13426 0.11619
## Proportion of Variance 0.9624 0.02008 0.00769 0.00266 0.00235 0.00139 0.00104
## Cumulative Proportion  0.9624 0.98246 0.99015 0.99280 0.99515 0.99654 0.99758
##                            PC8     PC9    PC10    PC11    PC12    PC13
## Standard deviation     0.10458 0.08758 0.07332 0.06655 0.04287 0.03541
## Proportion of Variance 0.00084 0.00059 0.00041 0.00034 0.00014 0.00010
## Cumulative Proportion  0.99842 0.99901 0.99942 0.99976 0.99990 1.00000

pc$x[,1]

##  [1] -5.5335811 -4.5879442 -4.3188115 -4.9399491 -4.5738055 -3.7614023
##  [7] -4.5324365 -3.0585592 -3.2970446 -3.1130556 -2.2289360 -2.2744136
## [13] -1.4186807 -1.8546635 -1.3327440 -1.1640201  0.1885050  0.3028724
## [19]  1.3339061  2.2483950  1.2297556 -0.3844374  1.2144815  2.6099049
## [25]  3.5319686  3.2292649  3.7573781  3.9880269  3.3171467  3.3843121
## [31]  5.7154023  5.1747054  5.6415551  5.5069043

plot(pc$x[,1:2],pch=16)

cor(pc$x[,1:2])

##              PC1          PC2
## PC1 1.000000e+00 1.100695e-14
## PC2 1.100695e-14 1.000000e+00

plot(pc)

screeplot(pc, type = "line", main = "Scree plot")

library(devtools)
install_github("vqv/ggbiplot")

## WARNING: Rtools is required to build R packages, but is not currently installed.
## 
## Please download and install Rtools 4.2 from https://cran.r-project.org/bin/windows/Rtools/ or https://www.r-project.org/nosvn/winutf8/ucrt3/.

## Skipping install of 'ggbiplot' from a github remote, the SHA1 (7325e880) has not changed since last install.
##   Use `force = TRUE` to force installation

library(ggbiplot)
ggbiplot(pc)

ggbiplot(pc,varname.size = 2)

#cor_pc1=pc$rotation[,1]
#cor_pc2=pc$rotation[,2]
#plot(0,0,xlim=c(-1.1),ylim=(-0.7,4),asa)

#plot(databan$A50,databan$DM,pch=16)

#ang=atan(cor_pc2/cor_pc1)*180/pi
#sort(abs(ang))

cor_pc1 = pc$rotation[,1]
cor_pc2 = pc$rotation[,2]
plot(0,0, xlim=c(-1,1), ylim=c(-0.7,0.4), asp=1)
arrows(rep(0, 13), rep(0, 13),
       cor_pc1, cor_pc2)
text(cor_pc1+runif(13, 0, 0.1), cor_pc2, names(cor_pc1))
ang = atan(cor_pc2/cor_pc1)*180/pi
sort(cor_pc1)

##       A10         B     Kumar       A20        DM  A_1_3LCL   A1_2LCL       A30 
## 0.2642267 0.2751223 0.2753299 0.2755683 0.2761327 0.2782607 0.2783192 0.2785656 
##      ALCL       A70       A60       A40       A50 
## 0.2792227 0.2805758 0.2806135 0.2813391 0.2818247

text(cor_pc1+runif(13, 0.2, 0.4), cor_pc2,
     paste0(round(ang, 2), '°'))

sort(abs(ang))

##  A_1_3LCL       A50       A60       A70      ALCL       A40   A1_2LCL       A30 
##  9.424925  9.594067 15.816658 15.922140 18.879238 22.987648 25.005367 41.932675 
##     Kumar        DM         B       A20       A10 
## 47.228391 51.079343 51.515723 54.845629 67.367501

plot(databan$A50, databan$DM, pch=16)

library(ggbiplot)
data(databan)

## Warning in data(databan): data set 'databan' not found

ban.pca <- prcomp(databan, scale. = TRUE)
ggbiplot(ban.pca, obs.scale = 1, var.scale = 1,
         ellipse = TRUE, circle = TRUE) +
  scale_color_discrete(name = '') +
  theme(legend.direction = 'horizontal', legend.position = 'top')

biplot2 = ggbiplot(pcobj = ban.pca,alpha = 0.2,varname.adjust = T,
                  choices = c(1,2),
                  obs.scale = 1, var.scale = 1,  # Scaling of axis
                  #labels = row.names(databan),     # Add labels as rownames
                  labels.size = 3,
                  varname.size = 2,
                  varname.abbrev = TRUE,  # Abbreviate variable names (TRUE)
                  var.axes = T,      # Remove variable vectors (TRUE)
                  circle = T,        # Add unit variance circle (TRUE)
                  ellipse = TRUE)+ # Adding ellipses
  scale_color_discrete(name = '')+
  theme(legend.direction = 'horizontal', legend.position = 'top')
print(biplot2)

biplot3 = biplot2 + theme(legend.position = "bottom")
print(biplot3)

theme1 = biplot3 + theme_bw()
theme2 = biplot3 + theme_classic()
theme3 = biplot3 + theme_dark()
theme4 = biplot3 + theme_gray()
theme5 = biplot3 + theme_minimal()
theme6 = biplot3 + theme_void()

2 dimensiones muestran el 98% de los datos, (Cumulative proportion)

Encontrar el ángulo permite detectar la mejor variable represnetada en el PC1 Support Vector Machine LV, determina cual es la variable que se asocia más a la mortalidad de Bogotá

#Ahora sacamos el nuevo modelo usando solo materia seca y A50
mod1=lm(DM~A50,databan)
summary(mod1)

## 
## Call:
## lm(formula = DM ~ A50, data = databan)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -3.6676 -1.0634 -0.0506  1.1241  3.2293 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -4.372811   0.698360  -6.262  5.1e-07 ***
## A50          0.052451   0.002341  22.404  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.784 on 32 degrees of freedom
## Multiple R-squared:  0.9401, Adjusted R-squared:  0.9382 
## F-statistic: 501.9 on 1 and 32 DF,  p-value: < 2.2e-16

#DM=-4.37+0.05*A50
pred=mod1$fitted.values
plot(databan$DM,pred,pch=16)

cor(databan$DM,pred)

## [1] 0.9695714

cor(databan$DM,pred)^2

## [1] 0.9400687

cor(databan$DM,pred)**2

## [1] 0.9400687

library(dplyr)

## Warning: package 'dplyr' was built under R version 4.2.3

## 
## Attaching package: 'dplyr'

## The following objects are masked from 'package:plyr':
## 
##     arrange, count, desc, failwith, id, mutate, rename, summarise,
##     summarize

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

set.seed(123)
data2 = databan %>% 
  mutate(id = seq(nrow(databan)),
         pred = pred)

data_train = data2 %>% 
  sample_frac(0.7)

data_test = data2 %>% 
  filter(!id %in% data_train$id)

mod2 = lm(DM ~ A50, data_train)
summary(mod2)

## 
## Call:
## lm(formula = DM ~ A50, data = data_train)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -4.1577 -1.2208  0.0675  1.4601  2.7815 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -4.613740   0.876939  -5.261 2.81e-05 ***
## A50          0.054438   0.002857  19.053 3.66e-15 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.83 on 22 degrees of freedom
## Multiple R-squared:  0.9429, Adjusted R-squared:  0.9403 
## F-statistic:   363 on 1 and 22 DF,  p-value: 3.662e-15

pred_train = mod2$fitted.values
plot(data_train$DM, pred_train,
     pch=16, cex=2)

pred_test = predict(mod2, data_test)
plot(data_test$DM, pred_test,
     pch=16, cex=2)

cor(data_test$DM, pred_test)**2

## [1] 0.966838

Machine Learning Debe ajustar los datos a lo que se solicita Discriminiant analysis for estimating meristematic differentiation point based on banana

Clase 11-May-2023

JRV

2023-05-11