Ejercicio 1
1.Filtrar los registros para incluir únicamente los incendios ocurridos
en el estado de Idaho.
library(readr)
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.2.0 ✔ purrr 1.2.1
## ✔ forcats 1.0.1 ✔ stringr 1.6.0
## ✔ ggplot2 4.0.2 ✔ tibble 3.3.1
## ✔ lubridate 1.9.5 ✔ tidyr 1.3.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
df1 <- read_csv("StudyArea.csv")
## Rows: 439362 Columns: 14
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (10): ORGANIZATI, UNIT, SUBUNIT, SUBUNIT2, FIRENAME, CAUSE, STARTDATED, ...
## dbl (4): FID, YEAR_, STATE_FIPS, TOTALACRES
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
df1 %>%
filter(STATE=="Idaho")
## # A tibble: 36,510 × 14
## FID ORGANIZATI UNIT SUBUNIT SUBUNIT2 FIRENAME CAUSE YEAR_ STARTDATED
## <dbl> <chr> <chr> <chr> <chr> <chr> <chr> <dbl> <chr>
## 1 3971 FWS 14613 USIDBLR Bear Lake Nat… Y ROAD Human 1987 5/7/87 0:…
## 2 3972 FWS 14613 USIDBLR Bear Lake Nat… LIFTON Natu… 1991 5/2/91 0:…
## 3 3973 FWS 14613 USIDBLR Bear Lake Nat… SPRING … Human 1991 5/20/91 0…
## 4 3974 FWS 14613 USIDBLR Bear Lake Nat… RAINBOW Natu… 1990 6/9/90 0:…
## 5 3975 FWS 14613 USIDBLR Bear Lake Nat… RAINBOW… Human 1985 4/18/85 0…
## 6 3976 FWS 14613 USIDBLR Bear Lake Nat… HOAGESON Human 1988 10/24/88 …
## 7 3977 FWS 14613 USIDBLR Bear Lake Nat… MERKLEY Natu… 1991 7/25/91 0…
## 8 3978 FWS 14613 USIDBLR Bear Lake Nat… MERKLEY Human 1992 9/2/92 0:…
## 9 3979 FWS 14613 USIDBLR Bear Lake Nat… MERKLEY Human 2002 3/29/02 0…
## 10 3980 FWS 14613 USIDBLR Bear Lake Nat… N MERKL… Human 1994 7/11/94 0…
## # ℹ 36,500 more rows
## # ℹ 5 more variables: CONTRDATED <chr>, OUTDATED <chr>, STATE <chr>,
## # STATE_FIPS <dbl>, TOTALACRES <dbl>
2.Seleccionar únicamente las columnas YEAR_, CAUSE y
TOTALACRES.
df1%>%
select("YEAR_","CAUSE","TOTALACRES")
## # A tibble: 439,362 × 3
## YEAR_ CAUSE TOTALACRES
## <dbl> <chr> <dbl>
## 1 2001 Human 0.1
## 2 2002 Human 3
## 3 2002 Human 0.5
## 4 2001 Human 0.1
## 5 1994 Human 1
## 6 1994 Human 0.1
## 7 1999 Human 3
## 8 2003 Human 0.1
## 9 2005 Human 0.1
## 10 2005 Human 0.1
## # ℹ 439,352 more rows
3.Renombrar estas columnas con nombres más claros y
descriptivos.
df1 %>%
select("AÑO"=YEAR_,"CAUSA DEL INCENDIO"=CAUSE,"TOTAL DE ACRES"=TOTALACRES)
## # A tibble: 439,362 × 3
## AÑO `CAUSA DEL INCENDIO` `TOTAL DE ACRES`
## <dbl> <chr> <dbl>
## 1 2001 Human 0.1
## 2 2002 Human 3
## 3 2002 Human 0.5
## 4 2001 Human 0.1
## 5 1994 Human 1
## 6 1994 Human 0.1
## 7 1999 Human 3
## 8 2003 Human 0.1
## 9 2005 Human 0.1
## 10 2005 Human 0.1
## # ℹ 439,352 more rows
4.Agrupar la información por CAUSE y YEAR_.
df1%>%
group_by(CAUSE)%>%
summarise(media_acres = mean(TOTALACRES),
ds_acres = sd(TOTALACRES),
mediana_acres = median(TOTALACRES),
RIC_acres = IQR(TOTALACRES),
min_acres = min(TOTALACRES),
max_acres = max(TOTALACRES),
q1_acres = quantile(TOTALACRES)[2],
q3_acres = quantile(TOTALACRES)[4]) -> df1_cause
df1%>%
group_by(YEAR_)%>%
summarise(media_acres = mean(TOTALACRES),
ds_acres = sd(TOTALACRES),
mediana_acres = median(TOTALACRES),
RIC_acres = IQR(TOTALACRES),
min_acres = min(TOTALACRES),
max_acres = max(TOTALACRES),
q1_acres = quantile(TOTALACRES)[2],
q3_acres = quantile(TOTALACRES)[4]) -> df1_year
5.Resumir el total de acres quemados para cada combinación de causa y
año.
df1%>%
group_by(CAUSE,YEAR_)%>%
summarise(.groups = "drop_last",media_acres = mean(TOTALACRES),
ds_acres = sd(TOTALACRES),
mediana_acres = median(TOTALACRES),
RIC_acres = IQR(TOTALACRES),
min_acres = min(TOTALACRES),
max_acres = max(TOTALACRES),
q1_acres = quantile(TOTALACRES)[2],
q3_acres = quantile(TOTALACRES)[4]) -> df1_cause_year
6.Elaborar una visualización que muestre los resultados de manera
clara.
library(ggplot2)
df1_cause_year%>%
filter(CAUSE!="Unknown",CAUSE!="Undetermined")%>%
ggplot(aes(x = YEAR_,
y = media_acres,
color = CAUSE)) +
geom_line() +
geom_point() +
labs(
x = "Año",
y = "Promedio de acres quemados",
color = "Causa",
title = "Promedio anual de acres quemados por causa"
) +
theme_minimal()
Ejercicio 2
1.Genere una tabla que indique el número de medallas concedidas en cada
uno de los cinco principales deportes en 2016.
df2 <- read_csv("athlete_events.csv")
## Rows: 271116 Columns: 15
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (10): Name, Sex, Team, NOC, Games, Season, City, Sport, Event, Medal
## dbl (5): ID, Age, Height, Weight, Year
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
df2 %>%
group_by(Sport)%>%
filter(Year==2016,!is.na(Medal))%>%
summarise(total_medallas =length(Medal))%>%
arrange(desc(total_medallas))%>%
head(n=5)
## # A tibble: 5 × 2
## Sport total_medallas
## <chr> <int>
## 1 Athletics 192
## 2 Swimming 191
## 3 Rowing 144
## 4 Football 106
## 5 Hockey 99
2.Elabore una tabla que muestre la distribución de la edad de los
ganadores de medallas en los cinco principales deportes en 2016.
df2%>%
filter(Sport %in% c("Athletics","Swimming","Rowing","Football","Hockey"),!is.na(Medal),!is.na(Age),Year==2016)%>%
group_by(Age)%>%
summarise(total_medallas=length(Medal))
## # A tibble: 23 × 2
## Age total_medallas
## <dbl> <int>
## 1 16 6
## 2 17 3
## 3 18 7
## 4 19 34
## 5 20 40
## 6 21 62
## 7 22 64
## 8 23 46
## 9 24 60
## 10 25 62
## # ℹ 13 more rows
3.Identifique qué equipos nacionales ganaron el mayor número de
medallas en los cinco principales deportes en 2016.
df2%>%
filter(Sport %in% c("Athletics","Swimming","Rowing","Football","Hockey"),!is.na(Medal),!is.na(Team),Year==2016)%>%
group_by(Team)%>%
summarise(total_medallas=length(Medal))%>%
arrange(desc(total_medallas))
## # A tibble: 54 × 2
## Team total_medallas
## <chr> <int>
## 1 United States 127
## 2 Germany 88
## 3 Great Britain 69
## 4 Canada 45
## 5 Australia 43
## 6 Netherlands 34
## 7 Jamaica 30
## 8 Sweden 21
## 9 France 20
## 10 Brazil 19
## # ℹ 44 more rows
4.Presente un resumen de la tendencia del peso de los atletas
masculinos y femeninos ganadores en los cinco principales deportes en
2016.
df2%>%
filter(Sport %in% c("Athletics","Swimming","Rowing","Football","Hockey"),!is.na(Weight),Year==2016)%>%
group_by(Sex)%>%
summarise(.groups = "drop_last",media_peso = mean(Weight),
ds_peso = sd(Weight),
mediana_peso = median(Weight),
RIC_peso = IQR(Weight),
min_peso = min(Weight),
max_peso = max(Weight),
q1_peso = quantile(Weight)[2],
q3_peso = quantile(Weight)[4]) -> df2_pesos
df2_pesos
## # A tibble: 2 × 9
## Sex media_peso ds_peso mediana_peso RIC_peso min_peso max_peso q1_peso
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 F 61.9 10.5 60 12 38 136 55
## 2 M 77.7 13.9 76 15 47 165 70
## # ℹ 1 more variable: q3_peso <dbl>
Ejericico 3
1. Recopilación
library(tidyverse)
df3 <- read_tsv("us_state_population.tsv")
## Rows: 51 Columns: 11
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (2): State, Code
## dbl (9): 2010, 2011, 2012, 2013, 2014, 2015, 2016, 2017, 2018
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
df_long <- df3 %>%
pivot_longer(
cols = `2010`:`2018`,
names_to = "Year",
values_to = "Population"
)
View(df_long)
df_sep <- df_long %>%
unite("State_Code", State, Code, sep = "-")
df_sep <- df_sep %>%
separate(State_Code, into = c("State", "Code"), sep = "-")
head(df_sep)
## # A tibble: 6 × 4
## State Code Year Population
## <chr> <chr> <chr> <dbl>
## 1 Alabama AL 2010 4785448
## 2 Alabama AL 2011 4798834
## 3 Alabama AL 2012 4815564
## 4 Alabama AL 2013 4830460
## 5 Alabama AL 2014 4842481
## 6 Alabama AL 2015 4853160
4.Unión
df_final <- df_sep %>%
unite("State_Code", State, Code, sep = "_")
head(df_final)
## # A tibble: 6 × 3
## State_Code Year Population
## <chr> <chr> <dbl>
## 1 Alabama_AL 2010 4785448
## 2 Alabama_AL 2011 4798834
## 3 Alabama_AL 2012 4815564
## 4 Alabama_AL 2013 4830460
## 5 Alabama_AL 2014 4842481
## 6 Alabama_AL 2015 4853160
El conjunto de datos original se encontraba en formato ancho, donde los años estaban distribuidos en varias columnas. Mediante el uso de las funciones pivot_longer(), separate() y unite() del paquete tidyr, se reorganizó el dataset siguiendo los principios de datos ordenados. El resultado es un dataframe donde cada variable tiene su propia columna, cada observación su propia fila y cada valor su propia celda, facilitando así su análisis posterior.