The first thing we do is install the readr package with install.packages("readr"). Then we load the package with library(readr).
library(readr)
Once the library is loaded, we read the data set from its file path with the command
# Read the cars CSV from disk into a data frame.
carsDataSet <- read.csv(
  "C:/Users/Alfonso/Desktop/Ubiqum/Bloque_2/Tarea_1/R Tutorial Data Sets/cars.csv"
)
Now let's see what we got there:
Next, we convert the speed column to numeric
# Overwrite the speed column with its numeric representation.
carsDataSet$speed.of.car <- as.numeric(carsDataSet$speed.of.car)
and we rename the attributes (columns)
# Give the three columns short names: model, speed and distance.
colnames(carsDataSet) <- c("model", "speed", "distance")
Now we split the data into training and testing sets
# Use 70% of the rows (rounded) for training; the rest is held out for testing.
trainSize <- round(0.7 * nrow(carsDataSet))
testSize <- nrow(carsDataSet) - trainSize
# Fix the RNG seed so the random split is reproducible.
set.seed(123)
# Draw trainSize distinct row numbers without replacement.
training_indices <- sample.int(nrow(carsDataSet), size = trainSize)
# Negative indices drop the training rows, leaving the test rows.
testSet <- carsDataSet[-training_indices, ]
trainSet <- carsDataSet[training_indices, ]
and with this we generate the linear model
# Fit a simple linear regression of distance on speed using the training rows.
carsinfo <- lm(formula = distance ~ speed, data = trainSet)
and see the metrics of the model
# Print the fitted model's residual quantiles, coefficient table and R-squared.
summary(carsinfo)
##
## Call:
## lm(formula = distance ~ speed, data = trainSet)
##
## Residuals:
## Min 1Q Median 3Q Max
## -9.0012 -5.0012 -0.5603 2.1458 28.4109
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -35.2481 4.0712 -8.658 5.25e-10 ***
## speed 5.0735 0.2519 20.143 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 7.18 on 33 degrees of freedom
## Multiple R-squared: 0.9248, Adjusted R-squared: 0.9225
## F-statistic: 405.7 on 1 and 33 DF, p-value: < 2.2e-16
and we run the predictions
# Predict distance for the held-out test rows with the fitted cars model.
Predictionscars <- predict(object = carsinfo, newdata = testSet)
# 2.- Iris project ####
# library() loads a package
library(readr)
# Read the iris CSV from disk into a data frame.
# stringsAsFactors = TRUE keeps Species as a factor: since R 4.0.0 read.csv()
# leaves text columns as character by default, which would not reproduce the
# factor-based summary()/str() output recorded below and would turn the later
# as.numeric(Species) conversion into NA values.
IrisDataset <- read.csv(
  "C:/Users/Alfonso/Desktop/Ubiqum/Bloque_2/Tarea_1/R Tutorial Data Sets/iris.csv",
  stringsAsFactors = TRUE
)
Revisamos los datos que tenemos
# Show the data frame's attributes: column names, class and row names.
attributes(IrisDataset)
## $names
## [1] "X" "Sepal.Length" "Sepal.Width" "Petal.Length"
## [5] "Petal.Width" "Species"
##
## $class
## [1] "data.frame"
##
## $row.names
## [1] 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17
## [18] 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34
## [35] 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51
## [52] 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68
## [69] 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85
## [86] 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102
## [103] 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119
## [120] 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136
## [137] 137 138 139 140 141 142 143 144 145 146 147 148 149 150
# Summary of the data in the table (per-column quartiles / level counts).
summary(IrisDataset)
## X Sepal.Length Sepal.Width Petal.Length
## Min. : 1.00 Min. :4.300 Min. :2.000 Min. :1.000
## 1st Qu.: 38.25 1st Qu.:5.100 1st Qu.:2.800 1st Qu.:1.600
## Median : 75.50 Median :5.800 Median :3.000 Median :4.350
## Mean : 75.50 Mean :5.843 Mean :3.057 Mean :3.758
## 3rd Qu.:112.75 3rd Qu.:6.400 3rd Qu.:3.300 3rd Qu.:5.100
## Max. :150.00 Max. :7.900 Max. :4.400 Max. :6.900
## Petal.Width Species
## Min. :0.100 setosa :50
## 1st Qu.:0.300 versicolor:50
## Median :1.300 virginica :50
## Mean :1.199
## 3rd Qu.:1.800
## Max. :2.500
# Shows the type of each variable (column) together with its first values.
str(IrisDataset)
## 'data.frame': 150 obs. of 6 variables:
## $ X : int 1 2 3 4 5 6 7 8 9 10 ...
## $ Sepal.Length: num 5.1 4.9 4.7 4.6 5 5.4 4.6 5 4.4 4.9 ...
## $ Sepal.Width : num 3.5 3 3.2 3.1 3.6 3.9 3.4 3.4 2.9 3.1 ...
## $ Petal.Length: num 1.4 1.4 1.3 1.5 1.4 1.7 1.4 1.5 1.4 1.5 ...
## $ Petal.Width : num 0.2 0.2 0.2 0.2 0.2 0.4 0.3 0.2 0.2 0.1 ...
## $ Species : Factor w/ 3 levels "setosa","versicolor",..: 1 1 1 1 1 1 1 1 1 1 ...
# List the column names of the data frame.
names(IrisDataset)
## [1] "X" "Sepal.Length" "Sepal.Width" "Petal.Length"
## [5] "Petal.Width" "Species"
# names() above shows the column header names.
# Plot the whole data frame (scatterplot matrix of its columns).
plot(IrisDataset)
# Plot the sepal length values against their row position.
plot(IrisDataset$Sepal.Length)
# Normal quantile (Q-Q) plot: a way to see if the data is normally distributed.
qqnorm(IrisDataset$Sepal.Length)
Empezamos a trabajar el modelo
# Pattern: data$column <- as.<type>(data$column) converts a column in place.
# Encode Species as its numeric factor level code (one integer per species).
# Going through factor() first makes this work even when Species was read as
# character (the read.csv() default since R 4.0.0), where a bare as.numeric()
# would produce only NA values with a coercion warning.
IrisDataset$Species <- as.numeric(factor(IrisDataset$Species))
# Set the random seed to 123 so the sample() draw used for the train/test
# split below gives the same rows on every run.
set.seed(seed = 123)
Definimos el training y testing dataset
# round() rounds to the nearest whole number; nrow() counts the rows.
# 80% of the rows go to the training set.
trainSize <- round(0.8 * nrow(IrisDataset))
trainSize
## [1] 120
# Whatever is left over forms the test set.
testSize <- nrow(IrisDataset) - trainSize
testSize
## [1] 30
# sample.int(n, size) draws `size` distinct row numbers at random from 1..n
# (without replacement, so size can never exceed n). Naming the `size`
# argument is optional. With 150 rows and 120 training rows this is the same
# as sample(1:150, 120).
training_indices <- sample.int(nrow(IrisDataset), size = trainSize)
# Data frames are indexed as data[rows, columns]; leaving the column slot
# empty keeps every column.
# A leading "-" negates the selection: the test set is every row that was NOT
# drawn for training.
testSet <- IrisDataset[-training_indices, ]
trainSet <- IrisDataset[training_indices, ]
Generamos el modelo de predicción
# Fit a simple linear regression predicting petal width from petal length and
# store it in LinearModel.
# General form: lm(dependent ~ independent, data = <training data>),
# i.e. lm(y ~ f(x), data the model learns from).
# The formula can also be stored in a variable first and passed by name.
# Fit on trainSet only, so the rows in testSet stay unseen and predictions on
# them are genuinely out-of-sample. NOTE: summary/prediction output recorded in
# this document from a fit on the full IrisDataset will differ slightly.
LinearModel <- lm(Petal.Width ~ Petal.Length, data = trainSet)
Comprobamos que tal es el modelo
# summary() reports the model's errors and other figures of interest
# (residuals, coefficients, R-squared).
summary(LinearModel)
##
## Call:
## lm(formula = Petal.Width ~ Petal.Length, data = IrisDataset)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.56515 -0.12358 -0.01898 0.13288 0.64272
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -0.363076 0.039762 -9.131 4.7e-16 ***
## Petal.Length 0.415755 0.009582 43.387 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.2065 on 148 degrees of freedom
## Multiple R-squared: 0.9271, Adjusted R-squared: 0.9266
## F-statistic: 1882 on 1 and 148 DF, p-value: < 2.2e-16
predecimos los resultados
# Predict petal width for the test rows, then print the predicted values.
prediction <- predict(object = LinearModel, newdata = testSet)
prediction
## 1 2 3 11 18 19 28
## 0.2189821 0.2189821 0.1774065 0.2605576 0.2189821 0.3437087 0.2605576
## 33 36 48 55 56 57 58
## 0.2605576 0.1358310 0.2189821 1.5493994 1.5078239 1.5909749 1.0089174
## 59 61 62 65 66 70 77
## 1.5493994 1.0920684 1.3830972 1.1336440 1.4662483 1.2583706 1.6325505
## 83 84 98 100 105 113 125
## 1.2583706 1.7572771 1.4246728 1.3415217 2.0483059 1.9235793 2.0067304
## 131 141
## 2.1730325 1.9651548
# plot(x, y) draws a scatter plot; here only the prediction vector is passed.
plot(prediction)
# hist(variable) draws a histogram of the values.
hist(prediction)