# NOTE(review): this working directory is hard-coded and machine-specific;
# the relative path "weather.rds" below is resolved against it.
setwd("~/Google Drive/Clases/UDLA/Análisis de Datos")
# Optional one-time download of the data set (left disabled on purpose):
# if (!file.exists("weather.rds")) {
#   download.file(
#     "https://assets.datacamp.com/production/repositories/34/datasets/b3c1036d9a60a9dfe0f99051d2474a54f76055ea/weather.rds",
#     "weather.rds"
#   )
#   dateDownloaded <- date()
# }
# Read the serialized weather data set from the working directory
weather <- readRDS("weather.rds")
# Verify that weather is a data.frame by printing its class vector
class(weather)
[1] "data.frame"
# Look at the structure using dplyr's glimpse().
# dplyr is attached here; its %>% and select() are also used further down.
library(dplyr)
# Print one line per column: name, type and the first few values
glimpse(weather)
Observations: 286
Variables: 35
$ X <int> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 1…
$ year <int> 2014, 2014, 2014, 2014, 2014, 2014, 2014, 2014, 2014…
$ month <int> 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, …
$ measure <chr> "Max.TemperatureF", "Mean.TemperatureF", "Min.Temper…
$ X1 <chr> "64", "52", "39", "46", "40", "26", "74", "63", "52"…
$ X2 <chr> "42", "38", "33", "40", "27", "17", "92", "72", "51"…
$ X3 <chr> "51", "44", "37", "49", "42", "24", "100", "79", "57…
$ X4 <chr> "43", "37", "30", "24", "21", "13", "69", "54", "39"…
$ X5 <chr> "42", "34", "26", "37", "25", "12", "85", "66", "47"…
$ X6 <chr> "45", "42", "38", "45", "40", "36", "100", "93", "85…
$ X7 <chr> "38", "30", "21", "36", "20", "-3", "92", "61", "29"…
$ X8 <chr> "29", "24", "18", "28", "16", "3", "92", "70", "47",…
$ X9 <chr> "49", "39", "29", "49", "41", "28", "100", "93", "86…
$ X10 <chr> "48", "43", "38", "45", "39", "37", "100", "95", "89…
$ X11 <chr> "39", "36", "32", "37", "31", "27", "92", "87", "82"…
$ X12 <chr> "39", "35", "31", "28", "27", "25", "85", "75", "64"…
$ X13 <chr> "42", "37", "32", "28", "26", "24", "75", "65", "55"…
$ X14 <chr> "45", "39", "33", "29", "27", "25", "82", "68", "53"…
$ X15 <chr> "42", "37", "32", "33", "29", "27", "89", "75", "60"…
$ X16 <chr> "44", "40", "35", "42", "36", "30", "96", "85", "73"…
$ X17 <chr> "49", "45", "41", "46", "41", "32", "100", "85", "70…
$ X18 <chr> "44", "40", "36", "34", "30", "26", "89", "73", "57"…
$ X19 <chr> "37", "33", "29", "25", "22", "20", "69", "63", "56"…
$ X20 <chr> "36", "32", "27", "30", "24", "20", "89", "79", "69"…
$ X21 <chr> "36", "33", "30", "30", "27", "25", "85", "77", "69"…
$ X22 <chr> "44", "39", "33", "39", "34", "25", "89", "79", "69"…
$ X23 <chr> "47", "45", "42", "45", "42", "37", "100", "91", "82…
$ X24 <chr> "46", "44", "41", "46", "44", "41", "100", "98", "96…
$ X25 <chr> "59", "52", "44", "58", "43", "29", "100", "75", "49…
$ X26 <chr> "50", "44", "37", "31", "29", "28", "70", "60", "49"…
$ X27 <chr> "52", "45", "38", "34", "31", "29", "70", "60", "50"…
$ X28 <chr> "52", "46", "40", "42", "35", "27", "76", "65", "53"…
$ X29 <chr> "41", "36", "30", "26", "20", "10", "64", "51", "37"…
$ X30 <chr> "30", "26", "22", "10", "4", "-6", "50", "38", "26",…
$ X31 <chr> "30", "25", "20", "8", "5", "1", "57", "44", "31", "…
# View first 10 rows of the raw data
head(weather, n = 10)
Problemas detectados: (1) Las cabeceras de columna son valores, no nombres de variables: de X1 a X31 (días del mes). (2) Columna sin sentido: X es solo un índice de observación. (3) Las variables se almacenan tanto en filas como en columnas: la columna measure contiene nombres de variables (por ejemplo, las temperaturas máxima, media y mínima).
# Load the tidyr package, installing it first when it is not available.
# require() only attaches a package that is already installed, so after
# install.packages() the package still has to be attached with library();
# otherwise the first run on a fresh machine fails at gather().
if (!require(tidyr)) {
  install.packages("tidyr")
  library(tidyr)
}
# Gather the day columns X1..X31 into key/value pairs: `day` receives the
# former column name and `value` the cell content; NA cells are dropped
# (na.rm = TRUE).
# NOTE(review): gather()/spread() are superseded by pivot_longer()/
# pivot_wider(); they are kept here because the replacements may order the
# resulting rows differently -- confirm before migrating.
weather2 <- gather(weather, day, value, X1:X31, na.rm = TRUE)
# View the head
head(weather2)
# First remove column of row names (X, the first column)
weather2 <- weather2[, -1]
# Spread the data: one column per distinct `measure`, filled from `value`
weather3 <- spread(weather2, measure, value)
# View the head
head(weather3)
La columna day contiene caracteres “X” innecesarios. Las columnas year, month y day pueden unirse en una sola fecha con formato yyyy-mm-dd.
# Load the stringr and lubridate packages, installing them when missing.
# require() does not attach a package that was just installed, so library()
# is called after install.packages(); otherwise str_replace()/ymd() are not
# found on the first run.
if (!require(stringr)) {
  install.packages("stringr")
  library(stringr)
}
if (!require(lubridate)) {
  install.packages("lubridate")
  library(lubridate)
}
# Remove X's from day column (str_replace() drops the first "X" of each value)
weather3$day <- str_replace(weather3$day, "X", "")
# Unite the year, month, and day columns into one "year-month-day" string
weather4 <- unite(weather3, date, year, month, day, sep = "-")
# Convert date column to proper date format using lubridate's ymd()
weather4$date <- ymd(weather4$date)
# Rearrange columns using dplyr's select(): date first, then Events, then
# every column from CloudCover through WindDirDegrees
weather5 <- weather4 %>%
  select(date, Events, CloudCover:WindDirDegrees)
# View the head of weather5
head(weather5)
Variable types are not always correct: most character columns should be coerced to numeric. PrecipitationIn also contains “T” values, meaning “Trace”, which are treated as 0 before the conversion.
# Inspect the column types of weather5 before coercing them
str(weather5)
'data.frame': 366 obs. of 23 variables:
$ date : Date, format: "2014-12-01" ...
$ Events : chr "Rain" "Rain" "Rain-Snow" "Snow" ...
$ CloudCover : chr "6" "8" "8" "7" ...
$ Max.Dew.PointF : chr "46" "45" "37" "28" ...
$ Max.Gust.SpeedMPH : chr "29" "29" "28" "21" ...
$ Max.Humidity : chr "74" "100" "92" "85" ...
$ Max.Sea.Level.PressureIn : chr "30.45" "29.58" "29.81" "29.88" ...
$ Max.TemperatureF : chr "64" "48" "39" "39" ...
$ Max.VisibilityMiles : chr "10" "10" "10" "10" ...
$ Max.Wind.SpeedMPH : chr "22" "23" "21" "16" ...
$ Mean.Humidity : chr "63" "95" "87" "75" ...
$ Mean.Sea.Level.PressureIn: chr "30.13" "29.5" "29.61" "29.85" ...
$ Mean.TemperatureF : chr "52" "43" "36" "35" ...
$ Mean.VisibilityMiles : chr "10" "3" "7" "10" ...
$ Mean.Wind.SpeedMPH : chr "13" "13" "13" "11" ...
$ MeanDew.PointF : chr "40" "39" "31" "27" ...
$ Min.DewpointF : chr "26" "37" "27" "25" ...
$ Min.Humidity : chr "52" "89" "82" "64" ...
$ Min.Sea.Level.PressureIn : chr "30.01" "29.43" "29.44" "29.81" ...
$ Min.TemperatureF : chr "39" "38" "32" "31" ...
$ Min.VisibilityMiles : chr "10" "1" "1" "7" ...
$ PrecipitationIn : chr "0.01" "0.28" "0.02" "T" ...
$ WindDirDegrees : chr "268" "357" "230" "286" ...
# Replace "T" with "0" (T = trace).
# The column is read by its full name: `$Precipitation` only resolved through
# partial matching of `$` on a data.frame, which is fragile and is not
# supported on tibbles.
weather5$PrecipitationIn <- str_replace(weather5$PrecipitationIn, "T", "0")
# Convert characters to numerics. as.numeric is passed directly instead of
# being wrapped in funs(), which is deprecated as of dplyr 0.8.0.
weather6 <- mutate_at(weather5, vars(CloudCover:WindDirDegrees), as.numeric)
funs() is soft deprecated as of dplyr 0.8.0
please use list() instead
# Before:
funs(name = f(.)
# After:
list(name = ~f(.))
This warning is displayed once per session.
# Look at result: print the column types of weather6 after the conversion
str(weather6)
'data.frame': 366 obs. of 23 variables:
$ date : Date, format: "2014-12-01" ...
$ Events : chr "Rain" "Rain" "Rain-Snow" "Snow" ...
$ CloudCover : num 6 8 8 7 5 4 2 8 8 7 ...
$ Max.Dew.PointF : num 46 45 37 28 28 29 33 42 46 34 ...
$ Max.Gust.SpeedMPH : num 29 29 28 21 23 20 21 10 26 30 ...
$ Max.Humidity : num 74 100 92 85 75 82 89 96 100 89 ...
$ Max.Sea.Level.PressureIn : num 30.4 29.6 29.8 29.9 29.9 ...
$ Max.TemperatureF : num 64 48 39 39 42 45 42 44 49 44 ...
$ Max.VisibilityMiles : num 10 10 10 10 10 10 10 10 10 10 ...
$ Max.Wind.SpeedMPH : num 22 23 21 16 17 15 15 8 20 23 ...
$ Mean.Humidity : num 63 95 87 75 65 68 75 85 85 73 ...
$ Mean.Sea.Level.PressureIn: num 30.1 29.5 29.6 29.9 29.8 ...
$ Mean.TemperatureF : num 52 43 36 35 37 39 37 40 45 40 ...
$ Mean.VisibilityMiles : num 10 3 7 10 10 10 10 9 6 10 ...
$ Mean.Wind.SpeedMPH : num 13 13 13 11 12 10 6 4 11 14 ...
$ MeanDew.PointF : num 40 39 31 27 26 27 29 36 41 30 ...
$ Min.DewpointF : num 26 37 27 25 24 25 27 30 32 26 ...
$ Min.Humidity : num 52 89 82 64 55 53 60 73 70 57 ...
$ Min.Sea.Level.PressureIn : num 30 29.4 29.4 29.8 29.8 ...
$ Min.TemperatureF : num 39 38 32 31 32 33 32 35 41 36 ...
$ Min.VisibilityMiles : num 10 1 1 7 10 10 10 5 1 10 ...
$ PrecipitationIn : num 0.01 0.28 0.02 0 0 0 0 0 0.43 0.01 ...
$ WindDirDegrees : num 268 357 230 286 298 306 324 79 311 281 ...
Valores faltantes: en Max.Gust.SpeedMPH. Errores obvios: en Max.Humidity y Mean.VisibilityMiles. Cadenas vacías: en Events.
# Per-column summary of weather6 (quartiles, mean and NA counts for numerics)
summary(weather6)
date Events CloudCover
Min. :2014-12-01 Length:366 Min. :0.000
1st Qu.:2015-03-02 Class :character 1st Qu.:3.000
Median :2015-06-01 Mode :character Median :5.000
Mean :2015-06-01 Mean :4.708
3rd Qu.:2015-08-31 3rd Qu.:7.000
Max. :2015-12-01 Max. :8.000
Max.Dew.PointF Max.Gust.SpeedMPH Max.Humidity
Min. :-6.00 Min. : 0.00 Min. : 39.00
1st Qu.:32.00 1st Qu.:21.00 1st Qu.: 73.25
Median :47.50 Median :25.50 Median : 86.00
Mean :45.48 Mean :26.99 Mean : 85.69
3rd Qu.:61.00 3rd Qu.:31.25 3rd Qu.: 93.00
Max. :75.00 Max. :94.00 Max. :1000.00
NA's :6
Max.Sea.Level.PressureIn Max.TemperatureF Max.VisibilityMiles
Min. :29.58 Min. :18.00 Min. : 2.000
1st Qu.:30.00 1st Qu.:42.00 1st Qu.:10.000
Median :30.14 Median :60.00 Median :10.000
Mean :30.16 Mean :58.93 Mean : 9.907
3rd Qu.:30.31 3rd Qu.:76.00 3rd Qu.:10.000
Max. :30.88 Max. :96.00 Max. :10.000
Max.Wind.SpeedMPH Mean.Humidity Mean.Sea.Level.PressureIn
Min. : 8.00 Min. :28.00 Min. :29.49
1st Qu.:16.00 1st Qu.:56.00 1st Qu.:29.87
Median :20.00 Median :66.00 Median :30.03
Mean :20.62 Mean :66.02 Mean :30.04
3rd Qu.:24.00 3rd Qu.:76.75 3rd Qu.:30.19
Max. :38.00 Max. :98.00 Max. :30.77
Mean.TemperatureF Mean.VisibilityMiles Mean.Wind.SpeedMPH
Min. : 8.00 Min. :-1.000 Min. : 4.00
1st Qu.:36.25 1st Qu.: 8.000 1st Qu.: 8.00
Median :53.50 Median :10.000 Median :10.00
Mean :51.40 Mean : 8.861 Mean :10.68
3rd Qu.:68.00 3rd Qu.:10.000 3rd Qu.:13.00
Max. :84.00 Max. :10.000 Max. :22.00
MeanDew.PointF Min.DewpointF Min.Humidity
Min. :-11.00 Min. :-18.00 Min. :16.00
1st Qu.: 24.00 1st Qu.: 16.25 1st Qu.:35.00
Median : 41.00 Median : 35.00 Median :46.00
Mean : 38.96 Mean : 32.25 Mean :48.31
3rd Qu.: 56.00 3rd Qu.: 51.00 3rd Qu.:60.00
Max. : 71.00 Max. : 68.00 Max. :96.00
Min.Sea.Level.PressureIn Min.TemperatureF Min.VisibilityMiles
Min. :29.16 Min. :-3.00 Min. : 0.000
1st Qu.:29.76 1st Qu.:30.00 1st Qu.: 2.000
Median :29.94 Median :46.00 Median :10.000
Mean :29.93 Mean :43.33 Mean : 6.716
3rd Qu.:30.09 3rd Qu.:60.00 3rd Qu.:10.000
Max. :30.64 Max. :74.00 Max. :10.000
PrecipitationIn WindDirDegrees
Min. :0.0000 Min. : 1.0
1st Qu.:0.0000 1st Qu.:113.0
Median :0.0000 Median :222.0
Mean :0.1016 Mean :200.1
3rd Qu.:0.0400 3rd Qu.:275.0
Max. :2.9000 Max. :360.0
# Show every column of the records whose Max.Gust.SpeedMPH is missing
weather6[which(is.na(weather6[["Max.Gust.SpeedMPH"]])), ]
# Print the whole Events column
weather6[["Events"]]
[1] "Rain" "Rain"
[3] "Rain-Snow" "Snow"
[5] "None" "None"
[7] "None" "Rain"
[9] "Rain" "Rain"
[11] "None" "Rain-Snow"
[13] "Snow" "Snow"
[15] "Rain" "Rain"
[17] "Fog-Rain" "Rain"
[19] "None" "None"
[21] "Rain" "None"
[23] "Rain" "None"
[25] "None" "None"
[27] "Rain" "Rain"
[29] "Rain" "Snow"
[31] "Rain" "None"
[33] "None" "None"
[35] "Rain" "None"
[37] "Snow" "Fog-Snow"
[39] "None" "None"
[41] "Rain" "None"
[43] "None" "None"
[45] "None" "Snow"
[47] "None" "Fog-Rain-Snow"
[49] "None" "Fog-Snow"
[51] "Fog-Snow" "Snow"
[53] "None" "Rain-Snow"
[55] "Snow" "Snow"
[57] "Fog-Rain" "None"
[59] "Snow" "Snow"
[61] "None" "Snow"
[63] "None" "Snow"
[65] "Snow" "Fog-Snow"
[67] "None" "Snow"
[69] "Fog-Snow" "Snow"
[71] "Snow" "Snow"
[73] "Snow" "Fog-Snow"
[75] "None" "Rain-Snow"
[77] "Rain-Snow" "None"
[79] "Snow" "Snow"
[81] "Snow" "None"
[83] "None" "Snow"
[85] "None" "Fog-Rain-Snow"
[87] "None" "Snow"
[89] "Snow" "Fog-Snow"
[91] "Snow" "Rain"
[93] "Fog-Rain" "None"
[95] "None" "Fog-Rain"
[97] "Fog-Rain-Snow" "None"
[99] "Rain-Snow" "None"
[101] "None" "Rain-Snow"
[103] "Snow" "Snow"
[105] "None" "None"
[107] "None" "Rain"
[109] "Fog-Rain" "Rain"
[111] "Fog-Rain-Snow" "None"
[113] "Rain-Snow" "Rain-Snow"
[115] "None" "Rain"
[117] "Snow" "None"
[119] "None" "Snow"
[121] "None" "None"
[123] "Rain" "None"
[125] "None" "None"
[127] "Rain" "None"
[129] "None" "Rain"
[131] "None" "None"
[133] "None" "Rain"
[135] "Fog-Rain-Thunderstorm" "Rain"
[137] "None" "Snow"
[139] "None" "None"
[141] "Rain" "Rain"
[143] "None" "Rain"
[145] "None" "Fog-Rain-Thunderstorm"
[147] "None" "None"
[149] "Rain" "Rain-Snow"
[151] "Rain" "None"
[153] "None" "None"
[155] "Rain" "None"
[157] "None" "None"
[159] "None" "None"
[161] "Fog" "Rain"
[163] "None" "None"
[165] "None" "Rain"
[167] "None" "None"
[169] "Rain" "None"
[171] "None" "Rain"
[173] "Fog" "None"
[175] "None" "Rain"
[177] "None" "Rain"
[179] "None" "None"
[181] "None" "None"
[183] "Rain" "None"
[185] "None" "None"
[187] "None" "None"
[189] "Rain" "Rain"
[191] "None" "None"
[193] "None" "Rain"
[195] "None" "Rain"
[197] "Fog" "Rain"
[199] "None" "None"
[201] "Rain" "Rain"
[203] "Rain" "None"
[205] "None" "None"
[207] "None" "None"
[209] "Rain" "None"
[211] "Rain" "Rain"
[213] "Rain-Thunderstorm" "Rain"
[215] "None" "None"
[217] "None" "Rain"
[219] "Rain" "None"
[221] "None" "Rain"
[223] "Rain" "None"
[225] "None" "None"
[227] "None" "None"
[229] "Rain" "None"
[231] "Rain" "Rain"
[233] "None" "None"
[235] "None" "Rain"
[237] "None" "None"
[239] "None" "None"
[241] "Rain" "Rain"
[243] "Rain" "Rain"
[245] "None" "Rain"
[247] "None" "None"
[249] "None" "Rain-Thunderstorm"
[251] "None" "None"
[253] "Rain-Thunderstorm" "None"
[255] "None" "None"
[257] "Fog-Rain-Thunderstorm" "Fog"
[259] "Rain" "None"
[261] "Fog" "Rain"
[263] "None" "None"
[265] "None" "Thunderstorm"
[267] "None" "None"
[269] "Fog-Rain-Hail-Thunderstorm" "None"
[271] "None" "None"
[273] "None" "None"
[275] "None" "Rain"
[277] "Rain" "Rain"
[279] "Rain" "None"
[281] "None" "None"
[283] "None" "None"
[285] "None" "None"
[287] "None" "None"
[289] "None" "None"
[291] "None" "None"
[293] "None" "None"
[295] "None" "Fog-Rain"
[297] "Rain-Thunderstorm" "Fog-Rain"
[299] "Rain" "None"
[301] "None" "None"
[303] "None" "None"
[305] "Rain" "None"
[307] "None" "None"
[309] "Fog-Rain" "None"
[311] "None" "Rain"
[313] "None" "None"
[315] "None" "Rain"
[317] "None" "Rain"
[319] "Fog" "None"
[321] "None" "Rain"
[323] "None" "None"
[325] "Rain" "Rain"
[327] "Rain" "None"
[329] "None" "None"
[331] "None" "None"
[333] "None" "None"
[335] "Rain" "Rain"
[337] "Rain" "Rain"
[339] "Rain" "Rain"
[341] "None" "None"
[343] "None" "None"
[345] "None" "Rain"
[347] "None" "Rain"
[349] "None" "Rain"
[351] "Rain" "None"
[353] "None" "None"
[355] "None" "Rain"
[357] "None" "None"
[359] "None" "None"
[361] "None" "None"
[363] "None" "None"
[365] "None" "Rain"
# Max.Humidity of 1000 is an obvious error (humidity ranges from 0 to 100):
# use 100 instead
weather6$Max.Humidity <- replace(
  weather6$Max.Humidity,
  which(weather6$Max.Humidity == 1000),
  100
)
# Mean.VisibilityMiles of -1 is impossible (it should be above 0):
# use 10 instead
weather6$Mean.VisibilityMiles <- replace(
  weather6$Mean.VisibilityMiles,
  which(weather6$Mean.VisibilityMiles == -1),
  10
)
# Replace empty cells in the Events column with "None"
weather6$Events <- replace(
  weather6$Events,
  which(weather6$Events == ""),
  "None"
)