This is an R Markdown document. Markdown is a simple formatting syntax for authoring HTML, PDF, and MS Word documents. For more details on using R Markdown see http://rmarkdown.rstudio.com.
When you click the Knit button a document will be generated that includes both content as well as the output of any embedded R code chunks within the document. You can embed an R code chunk like this:
summary(cars)
## speed dist
## Min. : 4.0 Min. : 2.00
## 1st Qu.:12.0 1st Qu.: 26.00
## Median :15.0 Median : 36.00
## Mean :15.4 Mean : 42.98
## 3rd Qu.:19.0 3rd Qu.: 56.00
## Max. :25.0 Max. :120.00
You can also embed plots, for example:
Note that the `echo = FALSE` parameter was added to the code chunk to prevent printing of the R code that generated the plot.
Code practice 1
Step 1
library(hflights)
summary(hflights)
## Year Month DayofMonth DayOfWeek DepTime
## Min. :2011 Min. : 1.000 Min. : 1.00 Min. :1.000 Min. : 1
## 1st Qu.:2011 1st Qu.: 4.000 1st Qu.: 8.00 1st Qu.:2.000 1st Qu.:1021
## Median :2011 Median : 7.000 Median :16.00 Median :4.000 Median :1416
## Mean :2011 Mean : 6.514 Mean :15.74 Mean :3.948 Mean :1396
## 3rd Qu.:2011 3rd Qu.: 9.000 3rd Qu.:23.00 3rd Qu.:6.000 3rd Qu.:1801
## Max. :2011 Max. :12.000 Max. :31.00 Max. :7.000 Max. :2400
## NA's :2905
## ArrTime UniqueCarrier FlightNum TailNum
## Min. : 1 Length:227496 Min. : 1 Length:227496
## 1st Qu.:1215 Class :character 1st Qu.: 855 Class :character
## Median :1617 Mode :character Median :1696 Mode :character
## Mean :1578 Mean :1962
## 3rd Qu.:1953 3rd Qu.:2755
## Max. :2400 Max. :7290
## NA's :3066
## ActualElapsedTime AirTime ArrDelay DepDelay
## Min. : 34.0 Min. : 11.0 Min. :-70.000 Min. :-33.000
## 1st Qu.: 77.0 1st Qu.: 58.0 1st Qu.: -8.000 1st Qu.: -3.000
## Median :128.0 Median :107.0 Median : 0.000 Median : 0.000
## Mean :129.3 Mean :108.1 Mean : 7.094 Mean : 9.445
## 3rd Qu.:165.0 3rd Qu.:141.0 3rd Qu.: 11.000 3rd Qu.: 9.000
## Max. :575.0 Max. :549.0 Max. :978.000 Max. :981.000
## NA's :3622 NA's :3622 NA's :3622 NA's :2905
## Origin Dest Distance TaxiIn
## Length:227496 Length:227496 Min. : 79.0 Min. : 1.000
## Class :character Class :character 1st Qu.: 376.0 1st Qu.: 4.000
## Mode :character Mode :character Median : 809.0 Median : 5.000
## Mean : 787.8 Mean : 6.099
## 3rd Qu.:1042.0 3rd Qu.: 7.000
## Max. :3904.0 Max. :165.000
## NA's :3066
## TaxiOut Cancelled CancellationCode Diverted
## Min. : 1.00 Min. :0.00000 Length:227496 Min. :0.000000
## 1st Qu.: 10.00 1st Qu.:0.00000 Class :character 1st Qu.:0.000000
## Median : 14.00 Median :0.00000 Mode :character Median :0.000000
## Mean : 15.09 Mean :0.01307 Mean :0.002853
## 3rd Qu.: 18.00 3rd Qu.:0.00000 3rd Qu.:0.000000
## Max. :163.00 Max. :1.00000 Max. :1.000000
## NA's :2947
# Total number of missing cells across all columns of hflights.
# is.na() already yields a logical matrix, so comparing it with TRUE is
# redundant; summing the logicals counts the TRUE entries directly.
sum(is.na(hflights))
## [1] 25755
table(complete.cases(hflights))
##
## FALSE TRUE
## 3622 223874
# Share of incomplete (FALSE) and complete (TRUE) rows, as percentages.
100 * proportions(table(complete.cases(hflights)))
##
## FALSE TRUE
## 1.592116 98.407884
# Number of missing values in each column, sorted in ascending order.
# vapply() pins the result to one integer per column, whereas sapply() picks
# its return type from the input and can silently change shape.
sort(vapply(hflights, function(column) sum(is.na(column)), integer(1)))
## Year Month DayofMonth DayOfWeek
## 0 0 0 0
## UniqueCarrier FlightNum TailNum Origin
## 0 0 0 0
## Dest Distance Cancelled CancellationCode
## 0 0 0 0
## Diverted DepTime DepDelay TaxiOut
## 0 2905 2905 2947
## ArrTime TaxiIn ActualElapsedTime AirTime
## 3066 3066 3622 3622
## ArrDelay
## 3622
Step 2
library(visdat)
## Warning: package 'visdat' was built under R version 4.5.1
vis_dat(airquality)
vis_miss(airquality)
library(ggplot2)
# Baseline scatter plot of Ozone against Solar.R.
ggplot(airquality, aes(Solar.R, Ozone)) +
  geom_point()
## Warning: Removed 42 rows containing missing values or values outside the scale range
## (`geom_point()`).
# Same Solar.R vs. Ozone scatter plot as the previous chunk. geom_point()
# drops the rows that contain missing values (see the warning printed below).
# NOTE(review): presumably repeated as the reference for the
# geom_miss_point() version that follows -- confirm the repeat is intended.
ggplot(airquality,
aes(x= Solar.R,
y= Ozone))+
geom_point()
## Warning: Removed 42 rows containing missing values or values outside the scale range
## (`geom_point()`).
library(naniar)
## Warning: package 'naniar' was built under R version 4.5.1
# Solar.R vs. Ozone drawn with naniar's geom_miss_point() in place of
# geom_point().
ggplot(airquality, aes(Solar.R, Ozone)) +
  geom_miss_point()

# Same plot, split into one panel per Month.
ggplot(airquality, aes(Solar.R, Ozone)) +
  geom_miss_point() +
  facet_wrap(~ Month)

# Faceted plot again, rendered with the dark theme.
ggplot(airquality, aes(Solar.R, Ozone)) +
  geom_miss_point() +
  facet_wrap(~ Month) +
  theme_dark()
# Missingness-per-variable plot for airquality (naniar).
gg_miss_var(airquality)
# ggplot2 layers can be added to the result with `+`: here a different theme...
gg_miss_var(airquality) + theme_bw()
# ...and here a custom label for the y aesthetic.
gg_miss_var(airquality) + labs(y= "look at all the missing ones")
# Same plot, faceted by the Month column.
gg_miss_var(airquality, facet = Month)
as_shadow(airquality)
## # A tibble: 153 × 6
## Ozone_NA Solar.R_NA Wind_NA Temp_NA Month_NA Day_NA
## <fct> <fct> <fct> <fct> <fct> <fct>
## 1 !NA !NA !NA !NA !NA !NA
## 2 !NA !NA !NA !NA !NA !NA
## 3 !NA !NA !NA !NA !NA !NA
## 4 !NA !NA !NA !NA !NA !NA
## 5 NA NA !NA !NA !NA !NA
## 6 !NA NA !NA !NA !NA !NA
## 7 !NA !NA !NA !NA !NA !NA
## 8 !NA !NA !NA !NA !NA !NA
## 9 !NA !NA !NA !NA !NA !NA
## 10 NA !NA !NA !NA !NA !NA
## # ℹ 143 more rows
# Two ways of building the same object: the original airquality columns
# followed by one `<name>_NA` shadow column per variable (see the glimpse()
# output and the all.equal() check further down).
aq_shadow <- bind_shadow(airquality)
aq_nab <- nabular(airquality)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
glimpse(aq_shadow)
## Rows: 153
## Columns: 12
## $ Ozone <int> 41, 36, 12, 18, NA, 28, 23, 19, 8, NA, 7, 16, 11, 14, 18, 1…
## $ Solar.R <int> 190, 118, 149, 313, NA, NA, 299, 99, 19, 194, NA, 256, 290,…
## $ Wind <dbl> 7.4, 8.0, 12.6, 11.5, 14.3, 14.9, 8.6, 13.8, 20.1, 8.6, 6.9…
## $ Temp <int> 67, 72, 74, 62, 56, 66, 65, 59, 61, 69, 74, 69, 66, 68, 58,…
## $ Month <int> 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,…
## $ Day <int> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, …
## $ Ozone_NA <fct> !NA, !NA, !NA, !NA, NA, !NA, !NA, !NA, !NA, NA, !NA, !NA, !…
## $ Solar.R_NA <fct> !NA, !NA, !NA, !NA, NA, NA, !NA, !NA, !NA, !NA, NA, !NA, !N…
## $ Wind_NA <fct> !NA, !NA, !NA, !NA, !NA, !NA, !NA, !NA, !NA, !NA, !NA, !NA,…
## $ Temp_NA <fct> !NA, !NA, !NA, !NA, !NA, !NA, !NA, !NA, !NA, !NA, !NA, !NA,…
## $ Month_NA <fct> !NA, !NA, !NA, !NA, !NA, !NA, !NA, !NA, !NA, !NA, !NA, !NA,…
## $ Day_NA <fct> !NA, !NA, !NA, !NA, !NA, !NA, !NA, !NA, !NA, !NA, !NA, !NA,…
glimpse(aq_nab)
## Rows: 153
## Columns: 12
## $ Ozone <int> 41, 36, 12, 18, NA, 28, 23, 19, 8, NA, 7, 16, 11, 14, 18, 1…
## $ Solar.R <int> 190, 118, 149, 313, NA, NA, 299, 99, 19, 194, NA, 256, 290,…
## $ Wind <dbl> 7.4, 8.0, 12.6, 11.5, 14.3, 14.9, 8.6, 13.8, 20.1, 8.6, 6.9…
## $ Temp <int> 67, 72, 74, 62, 56, 66, 65, 59, 61, 69, 74, 69, 66, 68, 58,…
## $ Month <int> 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,…
## $ Day <int> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, …
## $ Ozone_NA <fct> !NA, !NA, !NA, !NA, NA, !NA, !NA, !NA, !NA, NA, !NA, !NA, !…
## $ Solar.R_NA <fct> !NA, !NA, !NA, !NA, NA, NA, !NA, !NA, !NA, !NA, NA, !NA, !N…
## $ Wind_NA <fct> !NA, !NA, !NA, !NA, !NA, !NA, !NA, !NA, !NA, !NA, !NA, !NA,…
## $ Temp_NA <fct> !NA, !NA, !NA, !NA, !NA, !NA, !NA, !NA, !NA, !NA, !NA, !NA,…
## $ Month_NA <fct> !NA, !NA, !NA, !NA, !NA, !NA, !NA, !NA, !NA, !NA, !NA, !NA,…
## $ Day_NA <fct> !NA, !NA, !NA, !NA, !NA, !NA, !NA, !NA, !NA, !NA, !NA, !NA,…
all.equal(aq_shadow, aq_nab)
## [1] TRUE
# Summary statistics of Solar.R, split by whether Ozone is missing.
# summarise_at() is superseded in dplyr; across() is its replacement.
# `.names = "{.fn}"` keeps the output columns named mean / sd / var / min / max,
# and na.rm = TRUE is passed inside each function because forwarding extra
# arguments through across()'s `...` is deprecated.
airquality %>%
  bind_shadow() %>%
  group_by(Ozone_NA) %>%
  summarise(
    across(
      Solar.R,
      list(
        mean = function(x) mean(x, na.rm = TRUE),
        sd = function(x) sd(x, na.rm = TRUE),
        var = function(x) var(x, na.rm = TRUE),
        min = function(x) min(x, na.rm = TRUE),
        max = function(x) max(x, na.rm = TRUE)
      ),
      .names = "{.fn}"
    )
  )
## # A tibble: 2 × 6
## Ozone_NA mean sd var min max
## <fct> <dbl> <dbl> <dbl> <int> <int>
## 1 !NA 185. 91.2 8309. 7 334
## 2 NA 190. 87.7 7690. 31 332
# Density of Temp, one curve per level of the Ozone_NA shadow column.
ggplot(aq_shadow, aes(Temp, colour = Ozone_NA)) +
  geom_density()
# Histogram of air temperature, filled by whether humidity is missing.
ggplot(
  bind_shadow(oceanbuoys),
  aes(air_temp_c, fill = humidity_NA)
) +
  geom_histogram()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 81 rows containing non-finite outside the scale range
## (`stat_bin()`).
# Histogram of humidity, filled by whether air temperature is missing.
ggplot(
  bind_shadow(oceanbuoys),
  aes(humidity, fill = air_temp_c_NA)
) +
  geom_histogram()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 93 rows containing non-finite outside the scale range
## (`stat_bin()`).
library (simputation)
## Warning: package 'simputation' was built under R version 4.5.1
##
## Attaching package: 'simputation'
## The following object is masked from 'package:naniar':
##
## impute_median
library(dplyr)
# Impute missing Ozone values from a linear model on Temp and Wind, then
# plot Ozone against Temp.
ggplot(
  impute_lm(airquality, Ozone ~ Temp + Wind),
  aes(Temp, Ozone)
) +
  geom_point()
# Same linear-model imputation of Ozone, run on the shadow data so the
# Ozone_NA column still records which rows were originally missing; colouring
# by it separates imputed points from observed ones.
# NOTE(review): as.data.frame() presumably strips the tibble/shadow class
# because impute_lm() does not accept it -- confirm before removing.
aq_shadow %>%
as.data.frame() %>%
impute_lm(Ozone ~ Temp + Wind) %>%
ggplot(aes(x= Temp,
y = Ozone,
colour= Ozone_NA)) +
geom_point()
dplyr::n_distinct(airquality)
## [1] 153
dplyr::n_distinct(airquality$Ozone)
## [1] 68
n_miss(airquality)
## [1] 44
n_miss(airquality$Ozone)
## [1] 37
n_complete(airquality)
## [1] 874
n_complete(airquality$Ozone)
## [1] 116
prop_miss_case(airquality)
## [1] 0.2745098
pct_miss_case(airquality)
## [1] 27.45098
miss_case_summary(airquality)
## # A tibble: 153 × 3
## case n_miss pct_miss
## <int> <int> <dbl>
## 1 5 2 33.3
## 2 27 2 33.3
## 3 6 1 16.7
## 4 10 1 16.7
## 5 11 1 16.7
## 6 25 1 16.7
## 7 26 1 16.7
## 8 32 1 16.7
## 9 33 1 16.7
## 10 34 1 16.7
## # ℹ 143 more rows
miss_case_table(airquality)
## # A tibble: 3 × 3
## n_miss_in_case n_cases pct_cases
## <int> <int> <dbl>
## 1 0 111 72.5
## 2 1 40 26.1
## 3 2 2 1.31
prop_miss_var(airquality)
## [1] 0.3333333
pct_miss_var(airquality)
## [1] 33.33333
miss_var_summary(airquality)
## # A tibble: 6 × 3
## variable n_miss pct_miss
## <chr> <int> <num>
## 1 Ozone 37 24.2
## 2 Solar.R 7 4.58
## 3 Wind 0 0
## 4 Temp 0 0
## 5 Month 0 0
## 6 Day 0 0
miss_var_table(airquality)
## # A tibble: 3 × 3
## n_miss_in_var n_vars pct_vars
## <int> <int> <dbl>
## 1 0 4 66.7
## 2 7 1 16.7
## 3 37 1 16.7
miss_var_run(pedestrian, hourly_counts)
## # A tibble: 35 × 2
## run_length is_na
## <int> <chr>
## 1 6628 complete
## 2 1 missing
## 3 5250 complete
## 4 624 missing
## 5 3652 complete
## 6 1 missing
## 7 1290 complete
## 8 744 missing
## 9 7420 complete
## 10 1 missing
## # ℹ 25 more rows
miss_var_span(pedestrian,
hourly_counts,
span_every = 100)
## # A tibble: 377 × 6
## span_counter n_miss n_complete prop_miss prop_complete n_in_span
## <int> <int> <int> <dbl> <dbl> <int>
## 1 1 0 100 0 1 100
## 2 2 0 100 0 1 100
## 3 3 0 100 0 1 100
## 4 4 0 100 0 1 100
## 5 5 0 100 0 1 100
## 6 6 0 100 0 1 100
## 7 7 0 100 0 1 100
## 8 8 0 100 0 1 100
## 9 9 0 100 0 1 100
## 10 10 0 100 0 1 100
## # ℹ 367 more rows
# Per-variable missingness summary for the pedestrian data.
miss_var_summary(pedestrian)
## # A tibble: 9 × 3
## variable n_miss pct_miss
## <chr> <int> <num>
## 1 hourly_counts 2548 6.76
## 2 date_time 0 0
## 3 year 0 0
## 4 month 0 0
## 5 month_day 0 0
## 6 week_day 0 0
## 7 hour 0 0
## 8 sensor_id 0 0
## 9 sensor_name 0 0
pedestrian %>%
group_by(month) %>%
miss_var_summary() %>%
filter(variable == "hourly_counts")
## # A tibble: 12 × 4
## # Groups: month [12]
## month variable n_miss pct_miss
## <ord> <chr> <int> <num>
## 1 January hourly_counts 0 0
## 2 February hourly_counts 0 0
## 3 March hourly_counts 0 0
## 4 April hourly_counts 552 19.2
## 5 May hourly_counts 72 2.42
## 6 June hourly_counts 0 0
## 7 July hourly_counts 0 0
## 8 August hourly_counts 408 13.7
## 9 September hourly_counts 0 0
## 10 October hourly_counts 412 7.44
## 11 November hourly_counts 888 30.8
## 12 December hourly_counts 216 7.26
# First rows of airquality with the per-row proportion of missing values
# appended as prop_miss_all.
head(add_prop_miss(airquality))
## Ozone Solar.R Wind Temp Month Day prop_miss_all
## 1 41 190 7.4 67 5 1 0.0000000
## 2 36 118 8.0 72 5 2 0.0000000
## 3 12 149 12.6 74 5 3 0.0000000
## 4 18 313 11.5 62 5 4 0.0000000
## 5 NA NA 14.3 56 5 5 0.3333333
## 6 28 NA 14.9 66 5 6 0.1666667
library(rpart)
library(rpart.plot)
## Warning: package 'rpart.plot' was built under R version 4.5.1
# Fit a regression tree predicting each row's proportion of missing values
# (prop_miss_all) from the remaining columns, then plot the tree.
# model = TRUE stores the model frame inside the fit. Without it, prp() cannot
# retrieve the data passed through the pipe as `data = .` and warns that it
# cannot determine roundint and is.binary for the variables.
airquality %>%
  add_prop_miss() %>%
  rpart(prop_miss_all ~ ., data = ., model = TRUE) %>%
  prp(type = 4, extra = 101, prefix = "prop.Miss = ")