Clase 1

Carga y limpieza de datos

Los datos analizados acá provienen de DataIntelligence Chile.

# Read the CSV through the pandas module handle (`import` is presumably
# reticulate's — confirm) and show the first ten rows as a table.
pd <- import("pandas")
datos <- pd$read_csv("covid_19_clean_complete.csv")
kable(head(datos, n = 10))

Province/State	Country/Region	Lat	Long	Date
NaN	Afghanistan	33.0000	65.0000	1/22/20
NaN	Albania	41.1533	20.1683	1/22/20
NaN	Algeria	28.0339	1.6596	1/22/20
NaN	Andorra	42.5063	1.5218	1/22/20
NaN	Angola	-11.2027	17.8739	1/22/20
NaN	Antigua and Barbuda	17.0608	-61.7964	1/22/20
NaN	Argentina	-38.4161	-63.6167	1/22/20
NaN	Armenia	40.0691	45.0382	1/22/20
Australian Capital Territory	Australia	-35.4735	149.0124	1/22/20
New South Wales	Australia	-33.8688	151.2093	1/22/20

# Same file read with base R; the printed preview shows empty provinces as
# blank strings rather than NaN.
datos <- read.csv(file = "covid_19_clean_complete.csv")
kable(head(datos, n = 10))

Province.State	Country.Region	Lat	Long	Date
	Afghanistan	33.0000	65.0000	1/22/20
	Albania	41.1533	20.1683	1/22/20
	Algeria	28.0339	1.6596	1/22/20
	Andorra	42.5063	1.5218	1/22/20
	Angola	-11.2027	17.8739	1/22/20
	Antigua and Barbuda	17.0608	-61.7964	1/22/20
	Argentina	-38.4161	-63.6167	1/22/20
	Armenia	40.0691	45.0382	1/22/20
Australian Capital Territory	Australia	-35.4735	149.0124	1/22/20
New South Wales	Australia	-33.8688	151.2093	1/22/20

# Load the data and build the ten-row preview as a pipeline, one step per line.
datos <- read.csv(file = "covid_19_clean_complete.csv")
datos %>%
  head(n = 10) %>%
  kable()

Province.State	Country.Region	Lat	Long	Date
	Afghanistan	33.0000	65.0000	1/22/20
	Albania	41.1533	20.1683	1/22/20
	Algeria	28.0339	1.6596	1/22/20
	Andorra	42.5063	1.5218	1/22/20
	Angola	-11.2027	17.8739	1/22/20
	Antigua and Barbuda	17.0608	-61.7964	1/22/20
	Argentina	-38.4161	-63.6167	1/22/20
	Armenia	40.0691	45.0382	1/22/20
Australian Capital Territory	Australia	-35.4735	149.0124	1/22/20
New South Wales	Australia	-33.8688	151.2093	1/22/20

Clase 2

# Print the structure of `datos`: dimensions, column names and column types.
str(datos)

## 'data.frame':    26400 obs. of  8 variables:
##  $ Province.State: chr  "" "" "" "" ...
##  $ Country.Region: chr  "Afghanistan" "Albania" "Algeria" "Andorra" ...
##  $ Lat           : num  33 41.2 28 42.5 -11.2 ...
##  $ Long          : num  65 20.17 1.66 1.52 17.87 ...
##  $ Date          : chr  "1/22/20" "1/22/20" "1/22/20" "1/22/20" ...
##  $ Confirmed     : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ Deaths        : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ Recovered     : int  0 0 0 0 0 0 0 0 0 0 ...

# Rename the eight columns to Spanish identifiers. The assignment is
# positional, so the order must match the column order of the file.
colnames(datos) <- c(
  "Provincia_Estado", "Pais_Region", "Latitud", "Longitud",
  "Fecha", "Casos_Confirmados", "Casos_Muertos", "Casos_Recuperados"
)

# Show the first rows as a striped table with hover highlighting.
datos %>%
  head() %>%
  kable() %>%
  kable_styling(bootstrap_options = c("striped", "hover"))

Pais_Region	Latitud	Longitud	Fecha
Afghanistan	33.0000	65.0000	1/22/20
Albania	41.1533	20.1683	1/22/20
Algeria	28.0339	1.6596	1/22/20
Andorra	42.5063	1.5218	1/22/20
Angola	-11.2027	17.8739	1/22/20
Antigua and Barbuda	17.0608	-61.7964	1/22/20

Clase 3

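Las variables cualitativas se convierten con factor o bien con as.factor.
Las ordinales se convierten con ordered; por defecto los niveles quedan en orden alfabético, por lo que conviene indicar el orden real con el argumento levels.
Las cuantitativas se convierten con as.numeric.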

# Convert the date strings to an ordered factor whose levels follow the
# calendar. Without explicit levels, ordered() sorts the "m/d/yy" strings
# alphabetically, which places e.g. "2/10/20" before "2/2/20".
fechas_unicas <- as.character(unique(datos$Fecha))
orden_cronologico <- order(as.Date(fechas_unicas, format = "%m/%d/%y"))
datos$Fecha <- ordered(datos$Fecha, levels = fechas_unicas[orden_cronologico])

# Qualitative (nominal) columns become plain factors.
datos$Provincia_Estado <- factor(datos$Provincia_Estado)
datos$Pais_Region <- factor(datos$Pais_Region)

# Check the resulting column types.
str(datos)

## 'data.frame':    26400 obs. of  8 variables:
##  $ Provincia_Estado : Factor w/ 81 levels "","Alberta","Anguilla",..: 1 1 1 1 1 1 1 1 6 49 ...
##  $ Pais_Region      : Factor w/ 187 levels "Afghanistan",..: 1 2 3 4 5 6 7 8 9 9 ...
##  $ Latitud          : num  33 41.2 28 42.5 -11.2 ...
##  $ Longitud         : num  65 20.17 1.66 1.52 17.87 ...
##  $ Fecha            : Ord.factor w/ 100 levels "1/22/20"<"1/23/20"<..: 1 1 1 1 1 1 1 1 1 1 ...
##  $ Casos_Confirmados: int  0 0 0 0 0 0 0 0 0 0 ...
##  $ Casos_Muertos    : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ Casos_Recuperados: int  0 0 0 0 0 0 0 0 0 0 ...

Clase 4: la manipulación del dato tipo fecha

# Same conversions written as explicit assignments instead of the %<>%
# compound-assignment pipe.
datos$Provincia_Estado <- factor(datos$Provincia_Estado)
datos$Pais_Region <- factor(datos$Pais_Region)

# Parse month/day/two-digit-year strings into Date values.
datos$Fecha <- as.Date(datos$Fecha, format = "%m/%d/%y")

str(datos)

## 'data.frame':    26400 obs. of  8 variables:
##  $ Provincia_Estado : Factor w/ 81 levels "","Alberta","Anguilla",..: 1 1 1 1 1 1 1 1 6 49 ...
##  $ Pais_Region      : Factor w/ 187 levels "Afghanistan",..: 1 2 3 4 5 6 7 8 9 9 ...
##  $ Latitud          : num  33 41.2 28 42.5 -11.2 ...
##  $ Longitud         : num  65 20.17 1.66 1.52 17.87 ...
##  $ Fecha            : Date, format: "2020-01-22" "2020-01-22" ...
##  $ Casos_Confirmados: int  0 0 0 0 0 0 0 0 0 0 ...
##  $ Casos_Muertos    : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ Casos_Recuperados: int  0 0 0 0 0 0 0 0 0 0 ...

Clase 5: la librería lubridate

# Read the per-county case file; read_csv prints the column types it guessed.
covid19_data <- read_csv(file = "us-counties.csv")

## Parsed with column specification:
## cols(
##   date = col_date(format = ""),
##   county = col_character(),
##   state = col_character(),
##   fips = col_character(),
##   cases = col_double(),
##   deaths = col_double()
## )

# Keep only New Jersey rows and drop the ones whose county is "Unknown".
NJ_covid19 <- covid19_data %>%
  dplyr::filter(state == "New Jersey", county != "Unknown")

# Read the county polygons from the shapefile.
# NOTE(review): absolute, machine-specific path — it will not resolve on
# another computer.
NJ_counties <- st_read(
  "C:/Users/usuario/Documents/GitHub/covid19/New_Jersey_Counties/data.shp"
)

## Reading layer `data' from data source `C:\Users\usuario\Documents\GitHub\covid19\New_Jersey_Counties\data.shp' using driver `ESRI Shapefile'
## Simple feature collection with 21 features and 22 fields
## geometry type:  MULTIPOLYGON
## dimension:      XY
## bbox:           xmin: 193684.7 ymin: 34945.75 xmax: 657059.7 ymax: 919556.3
## proj4string:    +proj=tmerc +lat_0=38.83333333333334 +lon_0=-74.5 +k=0.9999 +x_0=150000 +y_0=0 +datum=NAD83 +units=us-ft +no_defs

# Quick look at the county outlines.
plot(st_geometry(NJ_counties))

# Lower-case the shapefile's column names and the county key in both tables
# so the join compares identical strings.
names(NJ_counties) <- tolower(names(NJ_counties))
NJ_counties$county <- tolower(NJ_counties$county)
NJ_covid19$county <- tolower(NJ_covid19$county)

# Attach each county's geometry to its daily case rows and keep only the
# columns used for the map.
NJ_covid19_shapes <- NJ_covid19 %>%
  left_join(NJ_counties, by = "county") %>%
  dplyr::select(date, county, state, cases, deaths, geometry)

head(NJ_covid19_shapes)

## # A tibble: 6 x 6
##   date       county state   cases deaths                                geometry
##   <date>     <chr>  <chr>   <dbl>  <dbl>         <MULTIPOLYGON [US_survey_foot]>
## 1 2020-03-04 bergen New Je~     1      0 (((656201 783614.4, 656141.1 783413.6,~
## 2 2020-03-05 bergen New Je~     2      0 (((656201 783614.4, 656141.1 783413.6,~
## 3 2020-03-06 bergen New Je~     3      0 (((656201 783614.4, 656141.1 783413.6,~
## 4 2020-03-06 camden New Je~     1      0 (((342764 423475.8, 342804.1 423429.3,~
## 5 2020-03-07 bergen New Je~     3      0 (((656201 783614.4, 656141.1 783413.6,~
## 6 2020-03-07 camden New Je~     1      0 (((342764 423475.8, 342804.1 423429.3,~

# The join result printed above is a plain tibble; convert it to an sf object
# before handing it to geom_sf().
NJ_covid19_shapes <- st_as_sf(NJ_covid19_shapes)

# Animated choropleth: one frame per value of `date`.
covid_map <- ggplot() +
  # White base layer with every county, then the counties that have data,
  # filled by case count.
  geom_sf(data = NJ_counties, fill = "white") +
  geom_sf(data = NJ_covid19_shapes, aes(fill = cases)) +
  labs(
    title = "Spread of COVID-19 Throughout New Jersey",
    subtitle = "Date: {current_frame}",
    caption = "Data Source: The New York Times\nAuthor: Kevin Zolea",
    x = "",
    y = ""
  ) +
  cowplot::background_grid(major = "none", minor = "none") +
  # Remove axes and backgrounds so only the map, legend and labels remain.
  theme(
    axis.text.x = element_blank(),
    axis.ticks.x = element_blank(),
    axis.text.y = element_blank(),
    axis.ticks.y = element_blank(),
    axis.line = element_blank(),
    legend.background = element_blank(),
    legend.position = c(-0.3, 0.8),
    plot.background = element_blank(),
    panel.background = element_blank(),
    legend.text = element_text(size = 12),
    legend.title = element_text(colour = "black", size = 12, face = "bold"),
    plot.title = element_text(size = 20, face = "bold", hjust = 0.5),
    plot.subtitle = element_text(hjust = 0.5, size = 12),
    plot.caption = element_text(
      size = 11,
      hjust = .5,
      color = "black",
      face = "bold"
    )
  ) +
  scale_fill_distiller(
    "Number of Positive Cases",
    palette = "Reds",
    type = "div",
    direction = 1
  ) +
  transition_manual(date)

# Render the map animation: 27 frames at 2 frames per second, 500 x 500.
# The argument is spelled `nframes`; the previous `nframe` only worked
# through R's partial argument matching.
animate(
  covid_map,
  nframes = 27,
  fps = 2,
  end_pause = 15,
  height = 500,
  width = 500
)

library(tidyverse)
library(janitor)

## 
## Attaching package: 'janitor'

## The following objects are masked from 'package:stats':
## 
##     chisq.test, fisher.test

library(stringr)
library(tidyverse)
library(gganimate)
library(png)
# NOTE(review): this switches off every warning for the rest of the session,
# including the coercion warnings produced further down — consider removing.
options(warn = -1)

# The file starts with a UTF-8 byte-order mark; without declaring it the
# first column name is read as "ï..Series.Name" instead of "Series.Name".
gdp <- read.csv("GDP_Data.csv", fileEncoding = "UTF-8-BOM")
str(gdp)

## 'data.frame':    269 obs. of  16 variables:
##  $ ï..Series.Name: chr  "GDP (current US$)" "GDP (current US$)" "GDP (current US$)" "GDP (current US$)" ...
##  $ Series.Code   : chr  "NY.GDP.MKTP.CD" "NY.GDP.MKTP.CD" "NY.GDP.MKTP.CD" "NY.GDP.MKTP.CD" ...
##  $ Country.Name  : chr  "Afghanistan" "Albania" "Algeria" "American Samoa" ...
##  $ Country.Code  : chr  "AFG" "ALB" "DZA" "ASM" ...
##  $ X1990..YR1990.: chr  ".." "2028553750" "62045099642.7774" ".." ...
##  $ X2000..YR2000.: chr  ".." "3480355258.04122" "54790245600.5846" ".." ...
##  $ X2009..YR2009.: chr  "12439087076.7667" "12044208085.864" "137211039898.193" "678000000" ...
##  $ X2010..YR2010.: chr  "15856574731.4411" "11926957254.6288" "161207268655.392" "576000000" ...
##  $ X2011..YR2011.: chr  "17804292964.1045" "12890866742.6533" "200019057307.655" "574000000" ...
##  $ X2012..YR2012.: chr  "19907317065.6667" "12319784886.2038" "209058991952.125" "644000000" ...
##  $ X2013..YR2013.: chr  "20561069558.2152" "12776280961.155" "209755003250.664" "641000000" ...
##  $ X2014..YR2014.: chr  "20484885119.7348" "13228247844.1247" "213810022462.428" "643000000" ...
##  $ X2015..YR2015.: chr  "19907111418.9938" "11386931489.7968" "165979277276.907" "661000000" ...
##  $ X2016..YR2016.: chr  "19046357714.4928" "11883682170.8236" "160129866569.935" "653000000" ...
##  $ X2017..YR2017.: chr  "19543976895.4248" "13038538300.2644" "167555280113.181" "634000000" ...
##  $ X2018..YR2018.: chr  ".." ".." ".." ".." ...

# Preview the first rows of the raw GDP table.
head(gdp)

##      ï..Series.Name    Series.Code   Country.Name Country.Code   X1990..YR1990.
## 1 GDP (current US$) NY.GDP.MKTP.CD    Afghanistan          AFG               ..
## 2 GDP (current US$) NY.GDP.MKTP.CD        Albania          ALB       2028553750
## 3 GDP (current US$) NY.GDP.MKTP.CD        Algeria          DZA 62045099642.7774
## 4 GDP (current US$) NY.GDP.MKTP.CD American Samoa          ASM               ..
## 5 GDP (current US$) NY.GDP.MKTP.CD        Andorra          AND 1029048481.88051
## 6 GDP (current US$) NY.GDP.MKTP.CD         Angola          AGO 11228764963.1618
##     X2000..YR2000.   X2009..YR2009.   X2010..YR2010.   X2011..YR2011.
## 1               .. 12439087076.7667 15856574731.4411 17804292964.1045
## 2 3480355258.04122  12044208085.864 11926957254.6288 12890866742.6533
## 3 54790245600.5846 137211039898.193 161207268655.392 200019057307.655
## 4               ..        678000000        576000000        574000000
## 5 1434429703.33518 3660530702.97305 3355695364.23841 3442062830.13622
## 6 9129594818.60749 70307163678.1895 83799496611.6049  111789686464.26
##     X2012..YR2012.   X2013..YR2013.   X2014..YR2014.   X2015..YR2015.
## 1 19907317065.6667 20561069558.2152 20484885119.7348 19907111418.9938
## 2 12319784886.2038  12776280961.155 13228247844.1247 11386931489.7968
## 3 209058991952.125 209755003250.664 213810022462.428 165979277276.907
## 4        644000000        641000000        643000000        661000000
## 5 3164615186.94591 3281585236.32501 3350736367.25488 2811489408.89431
## 6 128052853643.447 136709862831.308 145712200312.505 116193649124.475
##     X2016..YR2016.   X2017..YR2017. X2018..YR2018.
## 1 19046357714.4928 19543976895.4248             ..
## 2 11883682170.8236 13038538300.2644             ..
## 3 160129866569.935 167555280113.181             ..
## 4        653000000        634000000             ..
## 5 2877311946.90265 3012914131.16971             ..
## 6 101123851090.473  122123822333.73             ..

# Keep columns 3-15: country name, country code and the yearly values up to
# 2017. The two Series.* columns and the 2018 column are dropped.
gdp <- gdp %>% select(3:15)

# Keep the first 217 rows (per the original note, these are the country rows).
gdp <- gdp[1:217, ]

gdp_tidy <- gdp %>%
  # Yearly columns are read as text; the ".." placeholder cannot be parsed
  # as a number and becomes NA.
  mutate_at(vars(contains("YR")), as.numeric) %>%
  # Wide to long: one row per country and year column.
  gather(year, value, 3:13) %>%
  janitor::clean_names() %>%
  # Year labels look like "X1990..YR1990."; extract the first four-digit run.
  # Taking characters 1-4 would give "X199", which parses to NA.
  mutate(year = as.numeric(stringr::str_extract(year, "[0-9]{4}")))

summary(gdp_tidy)

##  country_name       country_code            year          value          
##  Length:2387        Length:2387        Min.   : NA    Min.   :8.824e+06  
##  Class :character   Class :character   1st Qu.: NA    1st Qu.:4.435e+09  
##  Mode  :character   Mode  :character   Median : NA    Median :2.020e+10  
##                                        Mean   :NaN    Mean   :3.241e+11  
##                                        3rd Qu.: NA    3rd Qu.:1.384e+11  
##                                        Max.   : NA    Max.   :1.939e+13  
##                                        NA's   :2387   NA's   :193

# Print the structure of the long-format GDP table.
str(gdp_tidy)

## 'data.frame':    2387 obs. of  4 variables:
##  $ country_name: chr  "Afghanistan" "Albania" "Algeria" "American Samoa" ...
##  $ country_code: chr  "AFG" "ALB" "DZA" "ASM" ...
##  $ year        : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ value       : num  NA 2.03e+09 6.20e+10 NA 1.03e+09 ...

# Preview the first rows of the long-format GDP table.
head(gdp_tidy)

##     country_name country_code year       value
## 1    Afghanistan          AFG   NA          NA
## 2        Albania          ALB   NA  2028553750
## 3        Algeria          DZA   NA 62045099643
## 4 American Samoa          ASM   NA          NA
## 5        Andorra          AND   NA  1029048482
## 6         Angola          AGO   NA 11228764963

# Replace the in-memory `gdp_tidy` with the table stored in gdp_tidy.csv.
gdp_tidy <- read_csv("gdp_tidy.csv")

## Parsed with column specification:
## cols(
##   country_name = col_character(),
##   country_code = col_character(),
##   year = col_double(),
##   value = col_double()
## )

# Summarise each column of the reloaded table.
summary(gdp_tidy)

##  country_name       country_code            year          value          
##  Length:2387        Length:2387        Min.   :1990   Min.   :8.824e+06  
##  Class :character   Class :character   1st Qu.:2009   1st Qu.:4.435e+09  
##  Mode  :character   Mode  :character   Median :2012   Median :2.020e+10  
##                                        Mean   :2010   Mean   :3.241e+11  
##                                        3rd Qu.:2015   3rd Qu.:1.384e+11  
##                                        Max.   :2017   Max.   :1.939e+13  
##                                                       NA's   :193

# Print the structure of the reloaded table.
str(gdp_tidy)

## tibble [2,387 x 4] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
##  $ country_name: chr [1:2387] "Afghanistan" "Albania" "Algeria" "American Samoa" ...
##  $ country_code: chr [1:2387] "AFG" "ALB" "DZA" "ASM" ...
##  $ year        : num [1:2387] 1990 1990 1990 1990 1990 1990 1990 1990 1990 1990 ...
##  $ value       : num [1:2387] NA 2.03e+09 6.20e+10 NA 1.03e+09 ...
##  - attr(*, "spec")=
##   .. cols(
##   ..   country_name = col_character(),
##   ..   country_code = col_character(),
##   ..   year = col_double(),
##   ..   value = col_double()
##   .. )

# Preview the first rows of the reloaded table.
head(gdp_tidy)

## # A tibble: 6 x 4
##   country_name   country_code  year        value
##   <chr>          <chr>        <dbl>        <dbl>
## 1 Afghanistan    AFG           1990          NA 
## 2 Albania        ALB           1990  2028553750 
## 3 Algeria        DZA           1990 62045099643.
## 4 American Samoa ASM           1990          NA 
## 5 Andorra        AND           1990  1029048482.
## 6 Angola         AGO           1990 11228764963.

# Rank countries by GDP within each year and keep the ten largest per year.
gdp_formatted <- gdp_tidy %>%
  group_by(year) %>%
  mutate(
    # Largest value gets rank 1. Missing values keep rank NA (the default of
    # rank()), so the filter below drops them. Ties get averaged ranks.
    rank = rank(-value),
    # Share of the year's largest GDP. max() is used because indexing with
    # `rank == 1` also returns one NA per missing value, and that longer
    # vector was then silently recycled in the division.
    Value_rel = value / max(value, na.rm = TRUE),
    # Bar label: GDP in billions, rounded, with a leading space.
    Value_lbl = paste0(" ", round(value / 1e9))
  ) %>%
  ungroup() %>%
  filter(rank <= 10)

# Static bar-chart layer for the animation: one horizontal bar per country,
# positioned by rank, with the country name on the left and the value label
# on the right.
staticplot <- ggplot(
  gdp_formatted,
  aes(
    rank,
    group = country_name,
    fill = as.factor(country_name),
    color = as.factor(country_name)
  )
) +
  # Bars drawn as tiles centred at value / 2 so they start at zero.
  geom_tile(
    aes(y = value / 2, height = value, width = 0.9),
    alpha = 0.8, color = NA
  ) +
  geom_text(
    aes(y = 0, label = paste(country_name, " ")),
    vjust = 0.2, hjust = 1
  ) +
  geom_text(aes(y = value, label = Value_lbl, hjust = 0)) +
  # Flip to horizontal bars; clip = "off" lets labels extend past the panel.
  coord_flip(clip = "off", expand = FALSE) +
  scale_y_continuous(labels = scales::comma) +
  # Reverse the axis so rank 1 is drawn at the top.
  scale_x_reverse() +
  # "none" replaces the deprecated FALSE for switching off a legend.
  guides(color = "none", fill = "none") +
  theme(
    axis.line = element_blank(),
    axis.text.x = element_blank(),
    axis.text.y = element_blank(),
    axis.ticks = element_blank(),
    axis.title.x = element_blank(),
    axis.title.y = element_blank(),
    legend.position = "none",
    panel.background = element_blank(),
    panel.border = element_blank(),
    panel.grid.major = element_blank(),
    panel.grid.minor = element_blank(),
    panel.grid.major.x = element_line(size = .1, color = "grey"),
    panel.grid.minor.x = element_line(size = .1, color = "grey"),
    plot.title = element_text(
      size = 25, hjust = 0.5, face = "bold", colour = "grey", vjust = -1
    ),
    plot.subtitle = element_text(
      size = 18, hjust = 0.5, face = "italic", color = "grey"
    ),
    plot.caption = element_text(
      size = 8, hjust = 0.5, face = "italic", color = "grey"
    ),
    plot.background = element_blank(),
    plot.margin = margin(2, 2, 2, 4, "cm")
  )


# Animate the bar chart across years; the value axis follows the data while
# the rank axis stays fixed.
anim <- staticplot +
  transition_states(year, transition_length = 4, state_length = 1) +
  view_follow(fixed_x = TRUE) +
  labs(
    title = "GDP per Year : {closest_state}",
    subtitle = "Top 10 Countries",
    caption = "GDP in Billions USD | Data Source: World Bank Data"
  )

# Render 200 frames at 20 fps and write the result to gganim.gif.
animate(
  anim,
  nframes = 200,
  fps = 20,
  width = 1200,
  height = 1000,
  renderer = gifski_renderer("gganim.gif")
)

Análisis covid19

Christian Castro

01-05-2020

Clase 1

Carga y limpieza de datos

Clase 2

Clase 3

Clase 4: la manipulación del dato tipo fecha

Clase 5: la librería lubridate