Laboratorio de Clase
library(readr)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(ggplot2)
library(MASS)
##
## Attaching package: 'MASS'
## The following object is masked from 'package:dplyr':
##
## select
bike<- read.csv("./train.csv")
glimpse(bike)
## Observations: 10,886
## Variables: 12
## $ datetime <fct> 2011-01-01 00:00:00, 2011-01-01 01:00:00, 2011-01-0...
## $ season <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
## $ holiday <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
## $ workingday <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
## $ weather <int> 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, ...
## $ temp <dbl> 9.84, 9.02, 9.02, 9.84, 9.84, 9.84, 9.02, 8.20, 9.8...
## $ atemp <dbl> 14.395, 13.635, 13.635, 14.395, 14.395, 12.880, 13....
## $ humidity <int> 81, 80, 80, 75, 75, 75, 80, 86, 75, 76, 76, 81, 77,...
## $ windspeed <dbl> 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 6.0032, 0.0...
## $ casual <int> 3, 8, 5, 3, 0, 0, 2, 1, 1, 8, 12, 26, 29, 47, 35, 4...
## $ registered <int> 13, 32, 27, 10, 1, 1, 0, 2, 7, 6, 24, 30, 55, 47, 7...
## $ count <int> 16, 40, 32, 13, 1, 1, 2, 3, 8, 14, 36, 56, 84, 94, ...
byr
bike_data<-
bike %>%
dplyr::select(season, atemp, humidity)
head(bike_data)
## season atemp humidity
## 1 1 14.395 81
## 2 1 13.635 80
## 3 1 13.635 80
## 4 1 14.395 75
## 5 1 14.395 75
## 6 1 12.880 75
Generamos histograma por estacion y en el eje x la humedad
bike_data %>%
mutate(season_label = case_when(
season == 1 ~ "1. Spring",
season == 2 ~ "2. Summer",
season == 3 ~ "3. fall",
season == 4 ~ "4. winter")) %>%
ggplot( aes(x=humidity) ) +
geom_histogram(bins=30,aes(y = ..density..)) +
geom_rug()+
geom_density()+
facet_wrap(~season_label,ncol = 2)+
ggtitle("Histogram of humidity by season")
Mostramos los histogramas por temporada y en el eje x la temperatura
bike_data %>%
mutate(season_label = case_when(
season == 1 ~ "1. Spring",
season == 2 ~ "2. Summer",
season == 3 ~ "3. fall",
season == 4 ~ "4. winter")) %>%
ggplot( aes(x=atemp) ) +
geom_histogram(bins=30,aes(y = ..density..)) +
geom_rug()+
geom_density()+
facet_wrap(~season_label,ncol = 2)+
ggtitle("Histogram of atemp by season")
creamos breaks basado en un arreglo
bike_data$group<-cut(bike_data$atemp, breaks = c(0,10,20,30,40) )
bike_data %>% group_by(group)
## # A tibble: 10,886 x 4
## # Groups: group [5]
## season atemp humidity group
## <int> <dbl> <int> <fct>
## 1 1 14.4 81 (10,20]
## 2 1 13.6 80 (10,20]
## 3 1 13.6 80 (10,20]
## 4 1 14.4 75 (10,20]
## 5 1 14.4 75 (10,20]
## 6 1 12.9 75 (10,20]
## 7 1 13.6 80 (10,20]
## 8 1 12.9 86 (10,20]
## 9 1 14.4 75 (10,20]
## 10 1 17.4 76 (10,20]
## # ... with 10,876 more rows
creamos breaks basado en un quartile
bike_data$group<-cut(bike_data$atemp, breaks = quantile(bike_data$atemp,c(.0,.25,.5,.75,1)) )
bike_data %>% group_by(group)
## # A tibble: 10,886 x 4
## # Groups: group [5]
## season atemp humidity group
## <int> <dbl> <int> <fct>
## 1 1 14.4 81 (0.76,16.7]
## 2 1 13.6 80 (0.76,16.7]
## 3 1 13.6 80 (0.76,16.7]
## 4 1 14.4 75 (0.76,16.7]
## 5 1 14.4 75 (0.76,16.7]
## 6 1 12.9 75 (0.76,16.7]
## 7 1 13.6 80 (0.76,16.7]
## 8 1 12.9 86 (0.76,16.7]
## 9 1 14.4 75 (0.76,16.7]
## 10 1 17.4 76 (16.7,24.2]
## # ... with 10,876 more rows
Contruyendo por projections, considerando mutate pero usando length out + ggplot
bike_data %>% mutate(group= cut(bike_data$atemp, breaks = seq(.0,30, length.out = 30 ))) %>% group_by(group) %>% summarise(cnt=n())%>% ggplot(aes(x=group,y=cnt))+geom_point()
Construyendo la funcion de densidad, determinando la suma y la probabilidad
bike_data %>% mutate(group= cut(bike_data$atemp, breaks = seq(.0,30, length.out = 30 ))) %>%
group_by(group) %>%
summarise(cnt=n()) %>%
ungroup()%>%
mutate(sum=sum(cnt))%>%
mutate(prob=cnt/sum)
## # A tibble: 30 x 4
## group cnt sum prob
## <fct> <int> <int> <dbl>
## 1 (0,1.03] 2 10886 0.000184
## 2 (1.03,2.07] 1 10886 0.0000919
## 3 (2.07,3.1] 14 10886 0.00129
## 4 (3.1,4.14] 16 10886 0.00147
## 5 (4.14,5.17] 11 10886 0.00101
## 6 (5.17,6.21] 98 10886 0.00900
## 7 (6.21,7.24] 63 10886 0.00579
## 8 (7.24,8.28] 75 10886 0.00689
## 9 (8.28,9.31] 170 10886 0.0156
## 10 (9.31,10.3] 127 10886 0.0117
## # ... with 20 more rows
verificaremos que la probabilidad realmente de 1, la columna prob es la densidad y debe de dar 1
bike_data %>% mutate(group= cut(bike_data$atemp, breaks = seq(.0,30, length.out = 30 ))) %>%
group_by(group) %>%
summarise(cnt=n()) %>%
ungroup()%>%
mutate(sum=sum(cnt))%>%
mutate(prob=cnt/sum)%>%
summarize(sum(prob))
## # A tibble: 1 x 1
## `sum(prob)`
## <dbl>
## 1 1
Utilizando KDE para relacionar las dos variables aleatorias y determinar la densidad Lasuma de Z no retorna Uno, asi que hay que normalizarlo
bike_data_season<- bike_data %>% filter(season==1)
bike_density<- kde2d(bike_data_season$atemp, bike_data_season$humidity, n=1000)
str(bike_density)
## List of 3
## $ x: num [1:1000] 0.76 0.792 0.824 0.856 0.887 ...
## $ y: num [1:1000] 0 0.1 0.2 0.3 0.4 ...
## $ z: num [1:1000, 1:1000] 5.94e-15 6.37e-15 6.82e-15 7.31e-15 7.82e-15 ...
sum(bike_density$z)
## [1] 306.9665
Renormalizaremos z para que de 1 la sumatoria
bike_data_season<- bike_data %>% filter(season==1)
bike_density<- kde2d(bike_data_season$atemp, bike_data_season$humidity, n=1000)
bike_density$z<-bike_density$z/sum(bike_density$z)
sum(bike_density$z)
## [1] 1
Graficaremos contornos
bike_data_season<- bike_data %>% filter(season==1)
bike_density<- kde2d(bike_data_season$atemp, bike_data_season$humidity, n=1000)
bike_density$z<-bike_density$z/sum(bike_density$z)
contour(bike_density)
text(12.2,92,"1",cex=1.5)
points(12.2,89,col="red",pch=18)
text(10.4,47,"2",cex=1.5)
points(10.4,43,col="red",pch=18)
text(21,45,"3",cex=1.5)
points(21,40,col="red",pch=18)
text(21.6,82,"4",cex=1.5)
points(21.6,78,col="red",pch=18)
Graficadora con estacla de colores
hm_col_scale<-colorRampPalette(c("black","blue","green","orange","red"))(1000)
image(bike_density$z,
col = hm_col_scale,
zlim=c(min(bike_density$z), max(bike_density$z)))
text(0.31,0.46,"Most Frequent",col="blue",cex=0.8)
points(0.31,0.45,col="blue",pch=18)
REstricciones de valores basado en indices utilizaremos wich para seleccionar solo las posiciones que cumplen las condiciones que se definan
atem_res<-which(bike_density$x>10 & bike_density$x<15)
hum_res<-which(bike_density$y>30 & bike_density$y<60)
prob_en_rango<-bike_density$z[atem_res,hum_res]
sum(prob_en_rango)
## [1] 0.1684039
Laboratorio por funciones: Dado un par de valorer contenidos en la función de probabilidad conjunta y un valor de longitud, determinar la probabilidad contenida sobre la línea que se forma en el eje x y en el eje y del punto.
Pregunta 1 Funcion unidimensional:
f_rango <- function(mat_source, temp_inicial, temp_final, humedad_inicial,humedad_final )
{
atem_res<-which(mat_source$x>temp_inicial & mat_source$x<temp_final)
hum_res<-which(mat_source$y>humedad_inicial & mat_source$y<humedad_final)
prob_en_rango<-mat_source$z[atem_res,hum_res]
rest<-sum(prob_en_rango)
return( rest)
}
f_hasta_punto <- function(mat_source, temp_final, humedad_final )
{
return( f_rango(mat_source,0,temp_final,0,humedad_final))
}
print('Generando funicon por rango (cuadrado)')
## [1] "Generando funicon por rango (cuadrado)"
f_rango(bike_density,10,15,30,60)
## [1] 0.1684039
print('Generando funicon por data un punto dese el (0,0)')
## [1] "Generando funicon por data un punto dese el (0,0)"
f_hasta_punto(bike_density,15,60)
## [1] 0.3381322
Funcion de probabilidad de un circulo mascara:
f_circulo_mask <- function(mat_source, centro_temp, centro_humedad, radio )
{
img<-matrix(0,ncol=1000,nrow=1000)
rcuad<-radio^2
for (i in 1:1000){
for (j in 1:1000){
x<-mat_source$x[i]
y<-mat_source$y[j]
xcuad<-(x-centro_temp)^2
ycuad<-(y-centro_humedad)^2
sumcuad<-xcuad+ycuad
if(sumcuad<rcuad)
{
img[i,j]<- 1
}
}
}
return (img)
}
f_circulo_prob<- function(mat_source, centro_temp, centro_humedad, radio )
{
mask<-f_circulo_mask(mat_source, centro_temp, centro_humedad, radio )
prob_en_rango<-mat_source$z*mask
rest<-sum(prob_en_rango)
return( rest)
}
img=f_circulo_mask(bike_density,12,30,10)
image(0:1000, 0:1000,img )
valor de probabilidad dentro de la mascara
print('Prob')
## [1] "Prob"
f_circulo_prob(bike_density,15,40,5)
## [1] 0.07292686