library(openxlsx)
library(tidyverse)
library(factoextra)
library(cowplot)
library(ggpubr)
library(cluster)
library(purrr)
library(dplyr)
Libros <- read.xlsx("/Users/eduardoleyva/Desktop/Diario/Top_100_Trending_Books.xlsx")
summary(Libros)
## Rank book.title book.price rating
## Min. : 1.00 Length:100 Min. : 2.780 Min. :4.10
## 1st Qu.: 25.75 Class :character 1st Qu.: 6.303 1st Qu.:4.60
## Median : 50.50 Mode :character Median :11.480 Median :4.70
## Mean : 50.50 Mean :12.709 Mean :4.69
## 3rd Qu.: 75.25 3rd Qu.:16.990 3rd Qu.:4.80
## Max. :100.00 Max. :48.770 Max. :5.00
## NA's :3
## author year.of.publication genre url
## Length:100 Min. :1947 Length:100 Length:100
## Class :character 1st Qu.:2014 Class :character Class :character
## Mode :character Median :2019 Mode :character Mode :character
## Mean :2014
## 3rd Qu.:2023
## Max. :2024
##
numeric_columns <- Libros[sapply(Libros, is.numeric)]  # keep only the numeric variables
Escalar <- scale(numeric_columns, center = TRUE, scale = TRUE)  # z-score standardization
Escalar <- as.data.frame(Escalar)
Escalar <- Escalar[complete.cases(Escalar), ]  # drop the 3 rows with missing values
str(Escalar)
## 'data.frame': 97 obs. of 4 variables:
## $ Rank : num -1.71 -1.67 -1.64 -1.6 -1.57 ...
## $ book.price : num 0.722 1.039 2.374 1.425 -0.892 ...
## $ rating : num -3.256 -1.047 -1.047 -1.599 0.609 ...
## $ year.of.publication: num 0.611 0.611 0.611 0.611 0.28 ...
total_within <- function(n_clusters, data, iter.max = 1000, nstart = 50){
  # Fit k-means with n_clusters centers and return the total within-cluster SS
  cluster_means <- kmeans(data, centers = n_clusters,
                          iter.max = iter.max,
                          nstart = nstart)
  return(cluster_means$tot.withinss)
}
total_withinss <- map_dbl(.x = 1:15,
                          .f = total_within,
                          data = Escalar)
total_withinss
## [1] 388.01141 286.64929 224.51635 170.23330 135.40300 114.27131 97.98697
## [8] 87.25978 77.25084 68.97922 63.33957 58.20390 53.40197 48.97433
## [15] 45.90609
data.frame(n_clusters = 1:15, suma_cuadrados_internos = total_withinss) %>%
ggplot(aes(x = n_clusters, y = suma_cuadrados_internos)) +
geom_line() +
geom_point() +
scale_x_continuous(breaks = 1:15) +
labs(title = "Suma total de cuadrados intra-cluster") +
theme_bw()
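# Complementary check, not run in the original: average silhouette width for
# k = 2..10 (cluster and purrr are already loaded); the k that maximizes the
# average width is a common alternative to the elbow heuristic.
avg_sil <- map_dbl(2:10, function(k) {
  km <- kmeans(Escalar, centers = k, nstart = 50)
  mean(silhouette(km$cluster, dist(Escalar))[, 3])
})
data.frame(n_clusters = 2:10, sil = avg_sil) %>%
  ggplot(aes(x = n_clusters, y = sil)) +
  geom_line() +
  geom_point() +
  scale_x_continuous(breaks = 2:10) +
  labs(title = "Average silhouette width by number of clusters") +
  theme_bw()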
kmcluster <- kmeans(Escalar, centers = 4, nstart = 50)
kmcluster
## K-means clustering with 4 clusters of sizes 32, 40, 15, 10
##
## Cluster means:
## Rank book.price rating year.of.publication
## 1 1.0534618 0.2557650 -0.1846194 0.2492484
## 2 -0.4903229 -0.4272087 0.4847771 0.2356170
## 3 -1.0352220 1.0399000 -1.4152967 0.4917224
## 4 0.1240888 -0.9026010 0.7746188 -2.6145882
##
## Clustering vector:
## 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20
## 3 3 3 3 2 2 3 2 3 3 2 4 2 2 2 3 3 3 2 2
## 21 22 23 24 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41
## 2 2 2 3 2 2 2 2 4 4 2 2 3 2 4 2 2 2 2 2
## 42 43 44 45 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62
## 2 2 4 2 3 2 2 2 3 2 2 2 2 4 2 1 1 2 1 2
## 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82
## 2 3 1 1 2 4 2 1 1 1 1 1 1 4 1 1 1 1 1 1
## 83 84 85 87 88 89 90 91 92 93 94 95 96 97 98 99 100
## 1 1 1 1 1 1 1 4 1 1 1 1 1 4 1 1 1
##
## Within cluster sum of squares by cluster:
## [1] 60.03961 47.44112 42.32306 20.42951
## (between_SS / total_SS = 56.1 %)
##
## Available components:
##
## [1] "cluster" "centers" "totss" "withinss" "tot.withinss"
## [6] "betweenss" "size" "iter" "ifault"
fviz_cluster(kmcluster, Escalar) +
  theme_minimal()
# Escalar was already reduced to complete cases above, so na.omit() is a no-op here
Escalar <- na.omit(Escalar)
fviz_cluster(kmcluster, Escalar, show.clust.cent = TRUE,
             ellipse.type = "euclid", star.plot = TRUE, repel = TRUE) +
  labs(title = "K-means clustering results") +
  theme_bw()
# Hierarchical clustering must run on the scaled numeric data: dist() cannot
# handle the character columns (title, author, genre, url) in the raw Libros
Libros2 <- dist(Escalar, method = 'euclidean')   # Euclidean distance between points
Libros3 <- hclust(Libros2, method = 'average')   # agglomerative clustering, average linkage
plot(Libros3, cex = 0.5, col = "red", hang = -1,
     main = "Dendrogram, Euclidean distance, average linkage")
rect.hclust(Libros3, k = 4, border = c("blue", "green", "purple", "orange"))
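# Hedged sketch, not run in the original: cross-tabulate the hierarchical
# assignments (cutree at k = 4) against the k-means labels to see how much
# the two solutions agree; both are fitted on the same rows of Escalar.
grupos_hc <- cutree(Libros3, k = 4)
table(hclust = grupos_hc, kmeans = kmcluster$cluster)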
library(car)
library(agricolae)
Pilotos <- read.xlsx("/Users/eduardoleyva/Desktop/Diario/F1DriversDataset.xlsx")
summary(Pilotos)
## Driver Nationality Seasons Championships
## Length:868 Length:868 Length:868 Min. :0.0000
## Class :character Class :character Class :character 1st Qu.:0.0000
## Mode :character Mode :character Mode :character Median :0.0000
## Mean :0.0841
## 3rd Qu.:0.0000
## Max. :7.0000
## Race_Entries Race_Starts Pole_Positions Race_Wins
## Min. : 1.00 Min. : 0.00 Min. : 0.000 Min. : 0.000
## 1st Qu.: 2.00 1st Qu.: 1.00 1st Qu.: 0.000 1st Qu.: 0.000
## Median : 7.00 Median : 5.00 Median : 0.000 Median : 0.000
## Mean : 29.92 Mean : 27.69 Mean : 1.244 Mean : 1.248
## 3rd Qu.: 29.25 3rd Qu.: 26.00 3rd Qu.: 0.000 3rd Qu.: 0.000
## Max. :359.00 Max. :356.00 Max. :103.000 Max. :103.000
## Podiums Fastest_Laps Points Active
## Min. : 0.000 Min. : 0.000 Min. : 0.00 Mode :logical
## 1st Qu.: 0.000 1st Qu.: 0.000 1st Qu.: 0.00 FALSE:848
## Median : 0.000 Median : 0.000 Median : 0.00 TRUE :20
## Mean : 3.757 Mean : 1.262 Mean : 55.85
## 3rd Qu.: 0.000 3rd Qu.: 0.000 3rd Qu.: 8.00
## Max. :191.000 Max. :77.000 Max. :4415.50
## Championship.Years Decade Pole_Rate Start_Rate
## Length:868 Min. :1950 Min. :0.00000 Min. :0.0000
## Class :character 1st Qu.:1960 1st Qu.:0.00000 1st Qu.:0.6667
## Mode :character Median :1970 Median :0.00000 Median :0.9623
## Mean :1972 Mean :0.01147 Mean :0.7798
## 3rd Qu.:1982 3rd Qu.:0.00000 3rd Qu.:1.0000
## Max. :2020 Max. :0.55769 Max. :1.0000
## Win_Rate Podium_Rate FastLap_Rate Points_Per_Entry
## Min. :0.00000 Min. :0.00000 Min. :0.00000 Min. : 0.0000
## 1st Qu.:0.00000 1st Qu.:0.00000 1st Qu.:0.00000 1st Qu.: 0.0000
## Median :0.00000 Median :0.00000 Median :0.00000 Median : 0.0000
## Mean :0.01105 Mean :0.04139 Mean :0.01189 Mean : 0.4792
## 3rd Qu.:0.00000 3rd Qu.:0.00000 3rd Qu.:0.00000 3rd Qu.: 0.3825
## Max. :0.46154 Max. :1.00000 Max. :0.50000 Max. :14.1977
## Years_Active Champion
## Min. : 1.000 Mode :logical
## 1st Qu.: 1.000 FALSE:834
## Median : 2.000 TRUE :34
## Mean : 3.665
## 3rd Qu.: 5.000
## Max. :19.000
# One-way ANOVA: does the mean number of podiums differ by nationality?
anova_model <- aov(Podiums ~ Nationality, data = Pilotos)
summary(anova_model)
## Df Sum Sq Mean Sq F value Pr(>F)
## Nationality 46 18750 407.6 2.068 5.98e-05 ***
## Residuals 821 161852 197.1
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
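# The ANOVA only says that some nationality means differ. A hedged sketch of a
# post-hoc comparison using Tukey's HSD via agricolae (loaded above but unused);
# note that Podiums is a heavily skewed count, so treat the groupings with caution.
posthoc <- HSD.test(anova_model, "Nationality", group = TRUE)
head(posthoc$groups)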
library(foreign)
library(ggplot2)
library(MASS)
library(openxlsx)
library(readxl)
ENEC <- read_excel("/Users/eduardoleyva/Desktop/Diario/ENEC.xlsx", col_types = c("text",
"numeric", "numeric", "numeric", "numeric"))
head(ENEC)
## # A tibble: 6 × 5
## ENTIDAD HORAS_OBREROS HORAS_EMPLEADOS HORAS_PROPIETARIOS INDICE
## <chr> <dbl> <dbl> <dbl> <dbl>
## 1 Nayarit 705494 204837 77732 81482
## 2 Guanajuato 692601 8554472992 10668963167 8222532996
## 3 Nayarit 646566 211751 99044 77542
## 4 Guerrero 630395 489. 176783 54813
## 5 Guerrero 628205 274508 165622 101644
## 6 Guerrero 602451 238038 197676 117398
dis <- lda(ENTIDAD ~ INDICE + HORAS_OBREROS + HORAS_EMPLEADOS + HORAS_PROPIETARIOS, data = ENEC)
dis
## Call:
## lda(ENTIDAD ~ INDICE + HORAS_OBREROS + HORAS_EMPLEADOS + HORAS_PROPIETARIOS,
## data = ENEC)
##
## Prior probabilities of groups:
## Aguascalientes Baja California
## 0.02920792 0.03267327
## Baja California Sur Campeche
## 0.03267327 0.03168317
## Chiapas Chihuahua
## 0.03341584 0.03193069
## Ciudad de México Coahuila de Zaragoza
## 0.03193069 0.03242574
## Colima Durango
## 0.03391089 0.02970297
## Guanajuato Guerrero
## 0.03143564 0.03292079
## Hidalgo Jalisco
## 0.03242574 0.03094059
## Michoacán de Ocampo Morelos
## 0.03292079 0.03316832
## Nayarit Nuevo León
## 0.03217822 0.03217822
## Oaxaca Puebla
## 0.03217822 0.03168317
## Querétaro de Arteaga Quintana Roo
## 0.03341584 0.03267327
## San Luis Potosí Sinaloa
## 0.03267327 0.03193069
## Sonora Tabasco
## 0.03316832 0.03143564
## Tamaulipas Tlaxcala
## 0.03168317 0.03143564
## Veracruz de Ignacio de la Llave Yucatán
## 0.03341584 0.03267327
## Zacatecas
## 0.03391089
##
## Group means:
## INDICE HORAS_OBREROS HORAS_EMPLEADOS
## Aguascalientes 77860.72 105155.58 82850.27
## Baja California 81191.06 93750.23 87525.80
## Baja California Sur 88548.97 173545.57 92065.09
## Campeche 92868.00 134476.09 100510.97
## Chiapas 69198.09 233271.01 187121.89
## Chihuahua 72274.78 81233.64 57683.03
## Ciudad de México 81525.55 74468.93 101683.25
## Coahuila de Zaragoza 106155.83 178483.89 133400.88
## Colima 86780.73 75500.18 59911.05
## Durango 87730.71 139766.05 85323.50
## Guanajuato 64838284.28 88932.72 67435719.38
## Guerrero 86847.98 287180.35 157280.28
## Hidalgo 99277.79 90684.74 75208.42
## Jalisco 74821.51 142283.87 121762.78
## Michoacán de Ocampo 85178.99 98731.65 96146.75
## Morelos 85242.00 218365.72 155148.83
## Nayarit 82321.07 157726.48 77283.35
## Nuevo León 75193.26 91584.89 69183.18
## Oaxaca 124639.85 260642.32 120400.75
## Puebla 75400.64 102458.88 56471.47
## Querétaro de Arteaga 71801.63 91706.85 75536.57
## Quintana Roo 63424.72 142573.14 102001.78
## San Luis Potosí 62347.97 65958.26 58356.64
## Sinaloa 93442.19 147459.59 166532.94
## Sonora 73574.30 94708.49 70911.58
## Tabasco 108795.68 111457.77 47494.52
## Tamaulipas 72786.08 117653.80 78012.09
## Tlaxcala 123216.46 74476.91 88917.06
## Veracruz de Ignacio de la Llave 97074.92 146655.90 101641.99
## Yucatán 93617.65 80941.08 71110.16
## Zacatecas 97068.01 55852.94 62169.72
## HORAS_PROPIETARIOS
## Aguascalientes 50719.95
## Baja California 84448.14
## Baja California Sur 43975.76
## Campeche 65305.26
## Chiapas 64330.86
## Chihuahua 105048.63
## Ciudad de México 54777.28
## Coahuila de Zaragoza 43161.54
## Colima 63181.21
## Durango 76372.09
## Guanajuato 84146914.12
## Guerrero 123794.51
## Hidalgo 86761.44
## Jalisco 92370.75
## Michoacán de Ocampo 87450.39
## Morelos 97758.66
## Nayarit 107487.31
## Nuevo León 87963.45
## Oaxaca 43509.95
## Puebla 89330.28
## Querétaro de Arteaga 48007.36
## Quintana Roo 131286.99
## San Luis Potosí 78331.12
## Sinaloa 86714.36
## Sonora 57788.13
## Tabasco 47561.82
## Tamaulipas 51609.41
## Tlaxcala 65690.34
## Veracruz de Ignacio de la Llave 76412.23
## Yucatán 74782.11
## Zacatecas 54490.23
##
## Coefficients of linear discriminants:
## LD1 LD2 LD3 LD4
## INDICE 5.451566e-07 1.616682e-05 1.643395e-05 -4.327628e-07
## HORAS_OBREROS 1.691965e-05 -4.972054e-06 8.717857e-06 1.045174e-07
## HORAS_EMPLEADOS 4.599227e-06 8.282946e-06 -1.908880e-05 -7.011807e-07
## HORAS_PROPIETARIOS -4.108916e-06 -1.910107e-05 2.639626e-06 8.897869e-07
##
## Proportion of trace:
## LD1 LD2 LD3 LD4
## 0.6561 0.2094 0.1308 0.0036
# Values follow the column order of ENEC[, 2:5]:
# HORAS_OBREROS, HORAS_EMPLEADOS, HORAS_PROPIETARIOS, INDICE
nuevo.dato <- rbind(c(300000, 200000, 100000, 15500))
colnames(nuevo.dato) <- colnames(ENEC[, 2:5])
nuevo.dato <- data.frame(nuevo.dato)
predict(dis,newdata = nuevo.dato)
## $class
## [1] Guerrero
## 31 Levels: Aguascalientes Baja California Baja California Sur ... Zacatecas
##
## $posterior
## Aguascalientes Baja California Baja California Sur Campeche Chiapas
## 1 0.0002302148 0.0002001718 0.003493726 0.001129843 0.3558781
## Chihuahua Ciudad de México Coahuila de Zaragoza Colima Durango
## 1 3.9444e-05 4.710735e-05 0.003589244 1.488488e-05 0.001251844
## Guanajuato Guerrero Hidalgo Jalisco Michoacán de Ocampo Morelos
## 1 3.35769e-05 0.3935484 4.357642e-05 0.009468375 0.000308059 0.204541
## Nayarit Nuevo León Oaxaca Puebla Querétaro de Arteaga
## 1 0.002628066 0.0001171913 0.002565438 0.0001212751 0.0001244475
## Quintana Roo San Luis Potosí Sinaloa Sonora Tabasco Tamaulipas
## 1 0.00682167 2.497309e-05 0.0109808 0.0001366346 1.115686e-05 0.0005746891
## Tlaxcala Veracruz de Ignacio de la Llave Yucatán Zacatecas
## 1 3.816119e-06 0.00204561 2.464499e-05 1.961374e-06
##
## $x
## LD1 LD2 LD3 LD4
## 1 3.26834 -1.572699 -1.623026 0.01099463
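# Hedged sketch, not run in the original: apparent (training-set) accuracy of
# the discriminant model. With 31 states and the inflated Guanajuato magnitudes
# visible in the group means above, expect this to be low and optimistic anyway.
pred_lda <- predict(dis)$class
mean(pred_lda == ENEC$ENTIDAD)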
library("stats")
library("psych")
library("readxl")
library("MASS")
library("ISLR")
library("fRegression")
library("vcd")
library("openxlsx")
Base1 <- read.xlsx("/Users/eduardoleyva/Desktop/Diario/Diabetes.xlsx")
modelo_logit1 <- glm(Outcome ~ Glucose + BMI + Age, data = Base1, family = binomial(link = "logit"))
summary(modelo_logit1)
##
## Call:
## glm(formula = Outcome ~ Glucose + BMI + Age, family = binomial(link = "logit"),
## data = Base1)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.3809 -0.7476 -0.4357 0.7861 2.8263
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -8.393743 0.666067 -12.602 < 2e-16 ***
## Glucose 0.032512 0.003329 9.767 < 2e-16 ***
## BMI 0.081590 0.013526 6.032 1.62e-09 ***
## Age 0.030157 0.007632 3.951 7.77e-05 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 993.48 on 767 degrees of freedom
## Residual deviance: 755.68 on 764 degrees of freedom
## AIC: 763.68
##
## Number of Fisher Scoring iterations: 5
modelo_probit1 <- glm(Outcome ~ Glucose + BMI + Age, data = Base1, family = binomial(link = "probit"))
summary(modelo_probit1)
##
## Call:
## glm(formula = Outcome ~ Glucose + BMI + Age, family = binomial(link = "probit"),
## data = Base1)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.4228 -0.7664 -0.4341 0.8132 2.9832
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -4.881501 0.359373 -13.583 < 2e-16 ***
## Glucose 0.018553 0.001869 9.928 < 2e-16 ***
## BMI 0.047392 0.007693 6.160 7.27e-10 ***
## Age 0.018676 0.004476 4.172 3.01e-05 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 993.48 on 767 degrees of freedom
## Residual deviance: 757.92 on 764 degrees of freedom
## AIC: 765.92
##
## Number of Fisher Scoring iterations: 5
CIA_Logit1 <- AIC(modelo_logit1)
CIA_Probit1 <- AIC(modelo_probit1)
CIA_Logit1
## [1] 763.6846
CIA_Probit1
## [1] 765.9199
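# Hedged sketch: McFadden's pseudo-R^2 for the logit model,
# 1 - residual deviance / null deviance (roughly 0.24 given the output above).
1 - modelo_logit1$deviance / modelo_logit1$null.deviance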
# predict() returns the linear predictor x'b -- on the probit scale this is a
# z-score, not log-odds, so it is named accordingly
lin_pred <- predict(modelo_probit1, data.frame(BMI = 32, Glucose = 170, Age = 60))
Prob1 <- pnorm(lin_pred)  # convert to a probability via the standard normal CDF
Prob1
## 1
## 0.8185115
predict(modelo_probit1, data.frame(BMI = 32, Glucose = 170, Age = 60), type="response")
## 1
## 0.8185115
# Multiplying the predicted probability by the coefficients is not a standard
# quantity; see the marginal-effect sketch after this output
Prob1 * coef(modelo_probit1)
## (Intercept) Glucose BMI Age
## -3.99556429 0.01518618 0.03879106 0.01528675
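# For a probit model the marginal effects at a point are dnorm(x'b) * b for the
# slope coefficients -- the normal *density*, not the predicted probability.
# A hedged sketch using the linear predictor computed above:
dnorm(lin_pred) * coef(modelo_probit1)[-1]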
predicciones1 <- ifelse(modelo_probit1$fitted.values > 0.5, yes = 1, no = 0)
matriz_confusion1 <- table(predicciones1, modelo_probit1$model$Outcome,
                           dnn = c("predictions", "observations"))
matriz_confusion1
## observations
## predictions 0 1
## 0 440 117
## 1 60 151
(440 + 151) / (440 + 117 + 60 + 151) * 100  # accuracy: (TN + TP) / total, using the cells shown above
## [1] 76.95312
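# Hedged sketch, not run in the original: sensitivity and specificity from the
# same confusion matrix, with rows = predictions and columns = observed outcomes.
sens <- matriz_confusion1[2, 2] / sum(matriz_confusion1[, 2])  # TP / (TP + FN)
spec <- matriz_confusion1[1, 1] / sum(matriz_confusion1[, 1])  # TN / (TN + FP)
c(sensitivity = sens, specificity = spec)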
library(neuralnet)
library(stats)
library(psych)
library(MASS)
library(ISLR)
library(fRegression)
library(vcd)
library(dplyr)
library(mlbench)
library(magrittr)
library(keras)
library(caret)
Agua <- read.xlsx("/Users/eduardoleyva/Desktop/Evidencia 2/El_Mojarral_20082.xlsx")
str(Agua)
## 'data.frame': 4322 obs. of 4 variables:
## $ Fecha : num 39448 39448 39448 39448 39448 ...
## $ Hora : num 0 0.0417 0.0833 0.125 0.1667 ...
## $ Nivel : num 0.393 0.393 0.396 0.399 0.397 ...
## $ Temperatura: num 28.7 28.5 28.4 28.1 27.6 ...
colSums(is.na(Agua))
## Fecha Hora Nivel Temperatura
## 0 0 0 0
set.seed(13)
train <- createDataPartition(y = Agua$Temperatura, p = 0.7, list = FALSE, times = 1)
Agua_train = Agua[train,]
Agua_test = Agua[-train,]
# First attempt on raw (unscaled) inputs, kept for reference; the standardized
# model RNS fitted below is the one actually evaluated
RN <- neuralnet(Temperatura ~ Fecha + Hora + Nivel,
                data = Agua_train,
                hidden = c(8, 6),
                linear.output = TRUE,
                lifesign = 'full',
                threshold = 0.05,
                rep = 1)
# Train/test split as matrices, separating the target variable
training <- as.matrix(Agua_train[, c(1, 2, 3)])
trainingtarget <- as.matrix(Agua_train[, 4])
test <- as.matrix(Agua_test[, c(1, 2, 3)])
testtarget <- as.matrix(Agua_test[, 4])
# Standardize the inputs with the training means and SDs (the same parameters
# are applied to the test set to avoid leakage)
m <- colMeans(training)       # column means
s <- apply(training, 2, sd)   # column standard deviations (MARGIN = 2 means by column; 1 would be by row)
training <- scale(training, center = m, scale = s)
test <- scale(test, center = m, scale = s)
Agua_train_S <- as.data.frame(cbind(training, (trainingtarget - mean(trainingtarget)) / sd(trainingtarget)))
colnames(Agua_train_S) <- colnames(Agua_train)
RNS <- neuralnet(Temperatura ~ Fecha + Hora + Nivel,
                 data = Agua_train_S,
                 hidden = c(8, 6),
                 linear.output = TRUE,
                 # linear.output must be TRUE for regression models and FALSE for classification
                 lifesign = 'full',
                 rep = 1,
                 # threshold = 0.02,
                 stepmax = 4000000)
plot(RNS,col.hidden = 'darkgreen',
col.hidden.synapse = 'darkgreen',
show.weights = T,
information = F,
fill = 'lightblue')
Agua_test_S <- as.data.frame(test)
colnames(Agua_test_S) <- colnames(Agua_test)[1:3]
RNSPredictions <- predict(RNS,Agua_test_S)
cor(RNSPredictions,(testtarget-mean(trainingtarget))/sd(trainingtarget))
## [,1]
## [1,] 0.9872836
RNSPred <- RNSPredictions * sd(trainingtarget) + mean(trainingtarget)  # back to the original temperature scale
plot(RNSPred,testtarget)
abline(a=0, b=1)
RSSnn <- (RNSPred - testtarget)^2
sum(RSSnn) / nrow(testtarget)  # test-set MSE
## [1] 0.01570494
1 - sum(RSSnn) / sum((testtarget - mean(trainingtarget))^2)  # test-set R^2 against the training-mean baseline
## [1] 0.9747231
LRM <- lm(Temperatura ~ Fecha+Hora+Nivel, data=Agua_train)
LRMPred <- predict(LRM, Agua_test[, c(1, 2, 3)])
cor(LRMPred,Agua_test[,4])
## [1] 0.7921928
plot(LRMPred,Agua_test[,4])
abline(a=0,b=1)
LRMRSS <- (LRMPred - Agua_test[, 4])^2
sum(LRMRSS) / nrow(testtarget)  # test-set MSE
## [1] 0.2319171
# Test-set R^2: the baseline must be the mean of the training *target*; the
# original used mean(Agua_train[, c(1, 2, 3)]), which is NA for a data frame
# and made the whole expression NA
1 - sum(LRMRSS) / sum((Agua_test[, 4] - mean(Agua_train[, 4]))^2)
par(mfrow=c(1,2))
plot(Agua_test$Temperatura,RNSPred,col='red',main='Real vs predicted NN',pch=19,cex=1)
abline(0,1,lwd=2)
legend('bottomright',legend='NN',pch=18,col='red', bty='n')
plot(Agua_test$Temperatura,LRMPred,col='blue',main='Real vs predicted LM',pch=15, cex=1)
abline(0,1,lwd=2)
legend('bottomright',legend='LM',pch=18,col='blue', bty='n', cex=.95)
plot(Agua_test$Temperatura,RNSPred,col='red',main='Real vs predicted NN',pch=19,cex=1)
points(Agua_test$Temperatura,LRMPred,col='blue',pch=15,cex=1)
abline(0,1,lwd=2)
legend('bottomright',legend=c('NN','LM'),pch=c(19,15),col=c('red','blue'))
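# Hedged sketch: a single-number comparison of the two models via test-set RMSE,
# both computed on the original temperature scale.
c(RMSE_NN = sqrt(mean((RNSPred - testtarget)^2)),
  RMSE_LM = sqrt(mean((LRMPred - Agua_test[, 4])^2)))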