Task 1 R

1. CARS

The first thing we do is install the readr package with install.packages("readr"). Then we load the package with library(readr).

library(readr)

Once the library is loaded, we read the data set from its file path with the command

# Read the cars CSV file into a data frame.
carsDataSet <- read.csv(
  "C:/Users/Alfonso/Desktop/Ubiqum/Bloque_2/Tarea_1/R Tutorial Data Sets/cars.csv"
)

Now let's see what we got there:

next we set the speed to numeric

# Coerce the speed column to numeric. The column is still called
# speed.of.car at this point.
carsDataSet$speed.of.car <- as.numeric(carsDataSet$speed.of.car)

and we change the attribute names

# Give the three columns short names.
colnames(carsDataSet) <- c("model", "speed", "distance")

now we split the data into training and testing sets (70% / 30%)

# Fix the random seed so the split below is reproducible.
set.seed(123)

# 70% of the rows go to training, the remainder to testing.
n_cars <- nrow(carsDataSet)
trainSize <- round(n_cars * 0.7)
testSize <- n_cars - trainSize

# Draw trainSize distinct row numbers from 1..n_cars.
training_indices <- sample.int(n_cars, size = trainSize)

# Rows drawn above form the training set; a negative index keeps the rest.
trainSet <- carsDataSet[training_indices, ]
testSet <- carsDataSet[-training_indices, ]

and with this we generate the linear model

# Fit a linear model of distance as a function of speed on the training rows.
carsinfo <- lm(formula = distance ~ speed, data = trainSet)

and see the metrics of the model

# Print the fitted model's summary (residuals, coefficients, R-squared).
summary(carsinfo)

## 
## Call:
## lm(formula = distance ~ speed, data = trainSet)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -9.0012 -5.0012 -0.5603  2.1458 28.4109 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -35.2481     4.0712  -8.658 5.25e-10 ***
## speed         5.0735     0.2519  20.143  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 7.18 on 33 degrees of freedom
## Multiple R-squared:  0.9248, Adjusted R-squared:  0.9225 
## F-statistic: 405.7 on 1 and 33 DF,  p-value: < 2.2e-16

and we run the predictions

# Apply the fitted cars model to the held-out test rows.
Predictionscars <- predict(carsinfo, newdata = testSet)

2. Iris

# 2.- Iris project ####
# library() loads a package
library(readr)

# Read the iris CSV file into a data frame.
# stringsAsFactors = TRUE keeps Species a factor on R >= 4.0 as well, where
# read.csv() no longer converts strings to factors by default. The later
# as.numeric() conversion of Species relies on it being a factor.
IrisDataset <- read.csv(
  "C:/Users/Alfonso/Desktop/Ubiqum/Bloque_2/Tarea_1/R Tutorial Data Sets/iris.csv",
  stringsAsFactors = TRUE
)

Revisamos los datos que tenemos

# List the data frame's attributes (column names, class, row names).
attributes(IrisDataset)

## $names
## [1] "X"            "Sepal.Length" "Sepal.Width"  "Petal.Length"
## [5] "Petal.Width"  "Species"     
## 
## $class
## [1] "data.frame"
## 
## $row.names
##   [1]   1   2   3   4   5   6   7   8   9  10  11  12  13  14  15  16  17
##  [18]  18  19  20  21  22  23  24  25  26  27  28  29  30  31  32  33  34
##  [35]  35  36  37  38  39  40  41  42  43  44  45  46  47  48  49  50  51
##  [52]  52  53  54  55  56  57  58  59  60  61  62  63  64  65  66  67  68
##  [69]  69  70  71  72  73  74  75  76  77  78  79  80  81  82  83  84  85
##  [86]  86  87  88  89  90  91  92  93  94  95  96  97  98  99 100 101 102
## [103] 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119
## [120] 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136
## [137] 137 138 139 140 141 142 143 144 145 146 147 148 149 150

# summary of the data in the table
summary(IrisDataset)

##        X           Sepal.Length    Sepal.Width     Petal.Length  
##  Min.   :  1.00   Min.   :4.300   Min.   :2.000   Min.   :1.000  
##  1st Qu.: 38.25   1st Qu.:5.100   1st Qu.:2.800   1st Qu.:1.600  
##  Median : 75.50   Median :5.800   Median :3.000   Median :4.350  
##  Mean   : 75.50   Mean   :5.843   Mean   :3.057   Mean   :3.758  
##  3rd Qu.:112.75   3rd Qu.:6.400   3rd Qu.:3.300   3rd Qu.:5.100  
##  Max.   :150.00   Max.   :7.900   Max.   :4.400   Max.   :6.900  
##   Petal.Width          Species  
##  Min.   :0.100   setosa    :50  
##  1st Qu.:0.300   versicolor:50  
##  Median :1.300   virginica :50  
##  Mean   :1.199                  
##  3rd Qu.:1.800                  
##  Max.   :2.500

# shows the type of each variable
str(IrisDataset)

## 'data.frame':    150 obs. of  6 variables:
##  $ X           : int  1 2 3 4 5 6 7 8 9 10 ...
##  $ Sepal.Length: num  5.1 4.9 4.7 4.6 5 5.4 4.6 5 4.4 4.9 ...
##  $ Sepal.Width : num  3.5 3 3.2 3.1 3.6 3.9 3.4 3.4 2.9 3.1 ...
##  $ Petal.Length: num  1.4 1.4 1.3 1.5 1.4 1.7 1.4 1.5 1.4 1.5 ...
##  $ Petal.Width : num  0.2 0.2 0.2 0.2 0.2 0.4 0.3 0.2 0.2 0.1 ...
##  $ Species     : Factor w/ 3 levels "setosa","versicolor",..: 1 1 1 1 1 1 1 1 1 1 ...

# shows the column header names
names(IrisDataset)

## [1] "X"            "Sepal.Length" "Sepal.Width"  "Petal.Length"
## [5] "Petal.Width"  "Species"

# Plot the whole data frame (all columns at once).
plot(IrisDataset)

# Plot the Sepal.Length column on its own.
plot(IrisDataset$Sepal.Length)

# Normal Quantile Plot- is a way to see if your data is normally distributed.
qqnorm(IrisDataset$Sepal.Length)

Empezamos a trabajar el modelo

# Pattern: data$column <- as.<type>(data$column_to_convert)
# Encode Species as numeric level codes. Going through as.factor() first keeps
# this working when read.csv() returned Species as character (the default
# since R 4.0), where a direct as.numeric() would produce only NA values.
# When Species is already a factor, as.factor() returns it unchanged.
IrisDataset$Species <- as.numeric(as.factor(IrisDataset$Species))
# Set the random seed to 123 so that the sample() call used for the
# train/test split is reproducible.
set.seed(123)

Definimos el training y testing dataset

# round() rounds to a whole number; nrow() gives the number of rows.
# 80% of the rows are used for training.
trainSize <- round(0.8 * nrow(IrisDataset))
trainSize

## [1] 120

# Whatever is not used for training is left for testing.
testSize <- nrow(IrisDataset) - trainSize
testSize

## [1] 30

#"size= " es redundante
# sample(x, n) draws n random values from x (n can never be larger than the
# number of values in x).
# seq_len(n) generates the sequence from 1 to n, so for this data set the call
# could also be written as sample(1:150, 120).
# The same pattern on the cars data:
#   training_indices <- sample(seq_len(nrow(carsDataSet)), size = trainSize)
training_indices <- sample(seq_len(nrow(IrisDataset)), trainSize)

# A data frame is indexed as name[rows, columns]; leaving the column slot
# empty keeps every column.
# The "-" acts as a negation: the test set takes the rows that are NOT among
# the sampled indices, the training set takes the rows that are.
testSet <- IrisDataset[-training_indices, ]
trainSet <- IrisDataset[training_indices, ]

Generamos el modelo de predicción

# We are predicting petal width from petal length and storing the fit in
# LinearModel.
# Remember: lm(dependent variable ~ independent variable, training data set),
# i.e. lm(y ~ f(x), data the model learns from).
# The formula can also be stored in a variable and passed by name.
# Fit on trainSet only: fitting on the full IrisDataset would let the model
# see the testSet rows it is evaluated on afterwards. Any summary or
# prediction output recorded from the earlier full-data fit must be
# regenerated.
LinearModel <- lm(Petal.Width ~ Petal.Length, data = trainSet)

Comprobamos que tal es el modelo

# summary() reports the errors and other statistics of interest
summary(LinearModel)

## 
## Call:
## lm(formula = Petal.Width ~ Petal.Length, data = IrisDataset)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.56515 -0.12358 -0.01898  0.13288  0.64272 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  -0.363076   0.039762  -9.131  4.7e-16 ***
## Petal.Length  0.415755   0.009582  43.387  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.2065 on 148 degrees of freedom
## Multiple R-squared:  0.9271, Adjusted R-squared:  0.9266 
## F-statistic:  1882 on 1 and 148 DF,  p-value: < 2.2e-16

predecimos los resultados

# Predict Petal.Width for the held-out rows, then print the predictions.
prediction <- predict(LinearModel, newdata = testSet)
prediction

##         1         2         3        11        18        19        28 
## 0.2189821 0.2189821 0.1774065 0.2605576 0.2189821 0.3437087 0.2605576 
##        33        36        48        55        56        57        58 
## 0.2605576 0.1358310 0.2189821 1.5493994 1.5078239 1.5909749 1.0089174 
##        59        61        62        65        66        70        77 
## 1.5493994 1.0920684 1.3830972 1.1336440 1.4662483 1.2583706 1.6325505 
##        83        84        98       100       105       113       125 
## 1.2583706 1.7572771 1.4246728 1.3415217 2.0483059 1.9235793 2.0067304 
##       131       141 
## 2.1730325 1.9651548

# plot(x, y) draws a scatter plot
plot(prediction)

# histogram(variable)
hist(prediction)

Task 1 R

Alfonso Noguer

30/5/2019

1. CARS

2. Iris