lab1-behg-tr5.utf8.md

Laboratorio de Clase

library(readr)
library(dplyr)

## 
## Attaching package: 'dplyr'

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

library(ggplot2)
library(MASS)

## 
## Attaching package: 'MASS'

## The following object is masked from 'package:dplyr':
## 
##     select

bike<- read.csv("./train.csv")
glimpse(bike)

## Observations: 10,886
## Variables: 12
## $ datetime   <fct> 2011-01-01 00:00:00, 2011-01-01 01:00:00, 2011-01-0...
## $ season     <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
## $ holiday    <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
## $ workingday <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
## $ weather    <int> 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, ...
## $ temp       <dbl> 9.84, 9.02, 9.02, 9.84, 9.84, 9.84, 9.02, 8.20, 9.8...
## $ atemp      <dbl> 14.395, 13.635, 13.635, 14.395, 14.395, 12.880, 13....
## $ humidity   <int> 81, 80, 80, 75, 75, 75, 80, 86, 75, 76, 76, 81, 77,...
## $ windspeed  <dbl> 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 6.0032, 0.0...
## $ casual     <int> 3, 8, 5, 3, 0, 0, 2, 1, 1, 8, 12, 26, 29, 47, 35, 4...
## $ registered <int> 13, 32, 27, 10, 1, 1, 0, 2, 7, 6, 24, 30, 55, 47, 7...
## $ count      <int> 16, 40, 32, 13, 1, 1, 2, 3, 8, 14, 36, 56, 84, 94, ...

byr

bike_data<-
  bike %>%  
  dplyr::select(season, atemp, humidity)
head(bike_data)

##   season  atemp humidity
## 1      1 14.395       81
## 2      1 13.635       80
## 3      1 13.635       80
## 4      1 14.395       75
## 5      1 14.395       75
## 6      1 12.880       75

Generamos histograma por estacion y en el eje x la humedad

bike_data %>%
  mutate(season_label = case_when(
    season == 1 ~ "1. Spring",
    season == 2 ~ "2. Summer",
    season == 3 ~ "3. fall",
    season == 4 ~ "4. winter")) %>%
  ggplot( aes(x=humidity) ) +
  geom_histogram(bins=30,aes(y = ..density..)) +
  geom_rug()+
  geom_density()+
  facet_wrap(~season_label,ncol = 2)+
  ggtitle("Histogram of humidity by season")

Mostramos los histogramas por temporada y en el eje x la temperatura

bike_data %>%
  mutate(season_label = case_when(
    season == 1 ~ "1. Spring",
    season == 2 ~ "2. Summer",
    season == 3 ~ "3. fall",
    season == 4 ~ "4. winter")) %>%
  ggplot( aes(x=atemp) ) +
  geom_histogram(bins=30,aes(y = ..density..)) +
  geom_rug()+
  geom_density()+
  facet_wrap(~season_label,ncol = 2)+
  ggtitle("Histogram of atemp by season")

creamos breaks basado en un arreglo

bike_data$group<-cut(bike_data$atemp, breaks = c(0,10,20,30,40) )
bike_data %>% group_by(group)

## # A tibble: 10,886 x 4
## # Groups:   group [5]
##    season atemp humidity group  
##     <int> <dbl>    <int> <fct>  
##  1      1  14.4       81 (10,20]
##  2      1  13.6       80 (10,20]
##  3      1  13.6       80 (10,20]
##  4      1  14.4       75 (10,20]
##  5      1  14.4       75 (10,20]
##  6      1  12.9       75 (10,20]
##  7      1  13.6       80 (10,20]
##  8      1  12.9       86 (10,20]
##  9      1  14.4       75 (10,20]
## 10      1  17.4       76 (10,20]
## # ... with 10,876 more rows

creamos breaks basado en un quartile

bike_data$group<-cut(bike_data$atemp, breaks = quantile(bike_data$atemp,c(.0,.25,.5,.75,1)) )
bike_data %>% group_by(group)

## # A tibble: 10,886 x 4
## # Groups:   group [5]
##    season atemp humidity group      
##     <int> <dbl>    <int> <fct>      
##  1      1  14.4       81 (0.76,16.7]
##  2      1  13.6       80 (0.76,16.7]
##  3      1  13.6       80 (0.76,16.7]
##  4      1  14.4       75 (0.76,16.7]
##  5      1  14.4       75 (0.76,16.7]
##  6      1  12.9       75 (0.76,16.7]
##  7      1  13.6       80 (0.76,16.7]
##  8      1  12.9       86 (0.76,16.7]
##  9      1  14.4       75 (0.76,16.7]
## 10      1  17.4       76 (16.7,24.2]
## # ... with 10,876 more rows

Contruyendo por projections, considerando mutate pero usando length out + ggplot

bike_data %>% mutate(group= cut(bike_data$atemp, breaks = seq(.0,30, length.out = 30 ))) %>% group_by(group) %>% summarise(cnt=n())%>% ggplot(aes(x=group,y=cnt))+geom_point()

Construyendo la funcion de densidad, determinando la suma y la probabilidad

bike_data %>% mutate(group= cut(bike_data$atemp, breaks = seq(.0,30, length.out = 30 ))) %>% 
  group_by(group) %>% 
  summarise(cnt=n()) %>%
  ungroup()%>%
  mutate(sum=sum(cnt))%>%
  mutate(prob=cnt/sum)

## # A tibble: 30 x 4
##    group         cnt   sum      prob
##    <fct>       <int> <int>     <dbl>
##  1 (0,1.03]        2 10886 0.000184 
##  2 (1.03,2.07]     1 10886 0.0000919
##  3 (2.07,3.1]     14 10886 0.00129  
##  4 (3.1,4.14]     16 10886 0.00147  
##  5 (4.14,5.17]    11 10886 0.00101  
##  6 (5.17,6.21]    98 10886 0.00900  
##  7 (6.21,7.24]    63 10886 0.00579  
##  8 (7.24,8.28]    75 10886 0.00689  
##  9 (8.28,9.31]   170 10886 0.0156   
## 10 (9.31,10.3]   127 10886 0.0117   
## # ... with 20 more rows

verificaremos que la probabilidad realmente de 1, la columna prob es la densidad y debe de dar 1

bike_data %>% mutate(group= cut(bike_data$atemp, breaks = seq(.0,30, length.out = 30 ))) %>% 
  group_by(group) %>% 
  summarise(cnt=n()) %>%
  ungroup()%>%
  mutate(sum=sum(cnt))%>%
  mutate(prob=cnt/sum)%>%
  summarize(sum(prob))

## # A tibble: 1 x 1
##   `sum(prob)`
##         <dbl>
## 1           1

Utilizando KDE para relacionar las dos variables aleatorias y determinar la densidad Lasuma de Z no retorna Uno, asi que hay que normalizarlo

bike_data_season<- bike_data %>% filter(season==1)
bike_density<- kde2d(bike_data_season$atemp, bike_data_season$humidity, n=1000)
str(bike_density)

## List of 3
##  $ x: num [1:1000] 0.76 0.792 0.824 0.856 0.887 ...
##  $ y: num [1:1000] 0 0.1 0.2 0.3 0.4 ...
##  $ z: num [1:1000, 1:1000] 5.94e-15 6.37e-15 6.82e-15 7.31e-15 7.82e-15 ...

sum(bike_density$z)

## [1] 306.9665

Renormalizaremos z para que de 1 la sumatoria

bike_data_season<- bike_data %>% filter(season==1)
bike_density<- kde2d(bike_data_season$atemp, bike_data_season$humidity, n=1000)

bike_density$z<-bike_density$z/sum(bike_density$z)
sum(bike_density$z)

## [1] 1

Graficaremos contornos

bike_data_season<- bike_data %>% filter(season==1)
bike_density<- kde2d(bike_data_season$atemp, bike_data_season$humidity, n=1000)

bike_density$z<-bike_density$z/sum(bike_density$z)
contour(bike_density)
text(12.2,92,"1",cex=1.5)
points(12.2,89,col="red",pch=18)
text(10.4,47,"2",cex=1.5)
points(10.4,43,col="red",pch=18)
text(21,45,"3",cex=1.5)
points(21,40,col="red",pch=18)
text(21.6,82,"4",cex=1.5)
points(21.6,78,col="red",pch=18)

Graficadora con estacla de colores

hm_col_scale<-colorRampPalette(c("black","blue","green","orange","red"))(1000)
image(bike_density$z,  
      col = hm_col_scale,
      zlim=c(min(bike_density$z), max(bike_density$z)))
text(0.31,0.46,"Most Frequent",col="blue",cex=0.8)
points(0.31,0.45,col="blue",pch=18)

REstricciones de valores basado en indices utilizaremos wich para seleccionar solo las posiciones que cumplen las condiciones que se definan

atem_res<-which(bike_density$x>10 & bike_density$x<15)
hum_res<-which(bike_density$y>30 & bike_density$y<60)
prob_en_rango<-bike_density$z[atem_res,hum_res]
sum(prob_en_rango)

## [1] 0.1684039

Laboratorio por funciones: Dado un par de valorer contenidos en la función de probabilidad conjunta y un valor de longitud, determinar la probabilidad contenida sobre la línea que se forma en el eje x y en el eje y del punto.

Pregunta 1 Funcion unidimensional:

f_rango <- function(mat_source, temp_inicial, temp_final, humedad_inicial,humedad_final )
{

atem_res<-which(mat_source$x>temp_inicial & mat_source$x<temp_final)
hum_res<-which(mat_source$y>humedad_inicial & mat_source$y<humedad_final)
prob_en_rango<-mat_source$z[atem_res,hum_res]
rest<-sum(prob_en_rango)
return( rest)

}

f_hasta_punto <- function(mat_source, temp_final, humedad_final )
{

return( f_rango(mat_source,0,temp_final,0,humedad_final))

}


print('Generando funicon por rango (cuadrado)')

## [1] "Generando funicon por rango (cuadrado)"

f_rango(bike_density,10,15,30,60)

## [1] 0.1684039

print('Generando funicon por data un punto dese el (0,0)')

## [1] "Generando funicon por data un punto dese el (0,0)"

f_hasta_punto(bike_density,15,60)

## [1] 0.3381322

Funcion de probabilidad de un circulo mascara:

f_circulo_mask <- function(mat_source, centro_temp, centro_humedad, radio )
{
  img<-matrix(0,ncol=1000,nrow=1000)
  rcuad<-radio^2
  for (i in 1:1000){
    for (j in 1:1000){
      x<-mat_source$x[i]
      y<-mat_source$y[j]
      xcuad<-(x-centro_temp)^2
      ycuad<-(y-centro_humedad)^2
      sumcuad<-xcuad+ycuad
      if(sumcuad<rcuad)
      {
        img[i,j]<- 1
      }
      
    }
  }

  return (img)
}

f_circulo_prob<- function(mat_source, centro_temp, centro_humedad, radio )
{
  mask<-f_circulo_mask(mat_source, centro_temp, centro_humedad, radio )
  prob_en_rango<-mat_source$z*mask
  rest<-sum(prob_en_rango)
return( rest)
}

img=f_circulo_mask(bike_density,12,30,10)
image(0:1000, 0:1000,img )

valor de probabilidad dentro de la mascara

print('Prob')

## [1] "Prob"

f_circulo_prob(bike_density,15,40,5)

## [1] 0.07292686