R Markdown

This is an R Markdown document. Markdown is a simple formatting syntax for authoring HTML, PDF, and MS Word documents. For more details on using R Markdown see http://rmarkdown.rstudio.com.

When you click the Knit button, a document will be generated that includes both the content and the output of any embedded R code chunks within the document. You can embed an R code chunk like this:

summary(cars)
##      speed           dist       
##  Min.   : 4.0   Min.   :  2.00  
##  1st Qu.:12.0   1st Qu.: 26.00  
##  Median :15.0   Median : 36.00  
##  Mean   :15.4   Mean   : 42.98  
##  3rd Qu.:19.0   3rd Qu.: 56.00  
##  Max.   :25.0   Max.   :120.00

Including Plots

You can also embed plots, for example:

Note that the echo = FALSE parameter was added to the code chunk to prevent printing of the R code that generated the plot.

Code practice 1: Step 1

# Load the hflights data set and print a per-column summary.
library(hflights)
# Numeric columns report quartiles plus an NA count where values are missing;
# character columns report only length, class and mode.
summary(hflights)
##       Year          Month          DayofMonth      DayOfWeek        DepTime    
##  Min.   :2011   Min.   : 1.000   Min.   : 1.00   Min.   :1.000   Min.   :   1  
##  1st Qu.:2011   1st Qu.: 4.000   1st Qu.: 8.00   1st Qu.:2.000   1st Qu.:1021  
##  Median :2011   Median : 7.000   Median :16.00   Median :4.000   Median :1416  
##  Mean   :2011   Mean   : 6.514   Mean   :15.74   Mean   :3.948   Mean   :1396  
##  3rd Qu.:2011   3rd Qu.: 9.000   3rd Qu.:23.00   3rd Qu.:6.000   3rd Qu.:1801  
##  Max.   :2011   Max.   :12.000   Max.   :31.00   Max.   :7.000   Max.   :2400  
##                                                                  NA's   :2905  
##     ArrTime     UniqueCarrier        FlightNum      TailNum         
##  Min.   :   1   Length:227496      Min.   :   1   Length:227496     
##  1st Qu.:1215   Class :character   1st Qu.: 855   Class :character  
##  Median :1617   Mode  :character   Median :1696   Mode  :character  
##  Mean   :1578                      Mean   :1962                     
##  3rd Qu.:1953                      3rd Qu.:2755                     
##  Max.   :2400                      Max.   :7290                     
##  NA's   :3066                                                       
##  ActualElapsedTime    AirTime         ArrDelay          DepDelay      
##  Min.   : 34.0     Min.   : 11.0   Min.   :-70.000   Min.   :-33.000  
##  1st Qu.: 77.0     1st Qu.: 58.0   1st Qu.: -8.000   1st Qu.: -3.000  
##  Median :128.0     Median :107.0   Median :  0.000   Median :  0.000  
##  Mean   :129.3     Mean   :108.1   Mean   :  7.094   Mean   :  9.445  
##  3rd Qu.:165.0     3rd Qu.:141.0   3rd Qu.: 11.000   3rd Qu.:  9.000  
##  Max.   :575.0     Max.   :549.0   Max.   :978.000   Max.   :981.000  
##  NA's   :3622      NA's   :3622    NA's   :3622      NA's   :2905     
##     Origin              Dest              Distance          TaxiIn       
##  Length:227496      Length:227496      Min.   :  79.0   Min.   :  1.000  
##  Class :character   Class :character   1st Qu.: 376.0   1st Qu.:  4.000  
##  Mode  :character   Mode  :character   Median : 809.0   Median :  5.000  
##                                        Mean   : 787.8   Mean   :  6.099  
##                                        3rd Qu.:1042.0   3rd Qu.:  7.000  
##                                        Max.   :3904.0   Max.   :165.000  
##                                                         NA's   :3066     
##     TaxiOut         Cancelled       CancellationCode      Diverted       
##  Min.   :  1.00   Min.   :0.00000   Length:227496      Min.   :0.000000  
##  1st Qu.: 10.00   1st Qu.:0.00000   Class :character   1st Qu.:0.000000  
##  Median : 14.00   Median :0.00000   Mode  :character   Median :0.000000  
##  Mean   : 15.09   Mean   :0.01307                      Mean   :0.002853  
##  3rd Qu.: 18.00   3rd Qu.:0.00000                      3rd Qu.:0.000000  
##  Max.   :163.00   Max.   :1.00000                      Max.   :1.000000  
##  NA's   :2947
# Total number of missing cells in hflights. is.na() already returns a
# logical matrix, so comparing it with TRUE is redundant.
sum(is.na(hflights))
## [1] 25755
# Count rows with at least one missing value (FALSE) versus fully observed
# rows (TRUE).
table(complete.cases(hflights))
## 
##  FALSE   TRUE 
##   3622 223874
# The same incomplete/complete split, expressed as a percentage of all rows.
prop.table(table(complete.cases(hflights))) * 100
## 
##     FALSE      TRUE 
##  1.592116 98.407884
# Missing-value count per column, sorted ascending. vapply() pins the result
# to exactly one integer per column, so the return type cannot silently change
# with the input the way sapply()'s can.
sort(vapply(hflights, function(x) sum(is.na(x)), integer(1)))
##              Year             Month        DayofMonth         DayOfWeek 
##                 0                 0                 0                 0 
##     UniqueCarrier         FlightNum           TailNum            Origin 
##                 0                 0                 0                 0 
##              Dest          Distance         Cancelled  CancellationCode 
##                 0                 0                 0                 0 
##          Diverted           DepTime          DepDelay           TaxiOut 
##                 0              2905              2905              2947 
##           ArrTime            TaxiIn ActualElapsedTime           AirTime 
##              3066              3066              3622              3622 
##          ArrDelay 
##              3622

Step 2

# visdat supplies the whole-data-frame overview plots used below.
library(visdat)
## Warning: package 'visdat' was built under R version 4.5.1
# Overview plot of the airquality data frame.
vis_dat(airquality)

# Overview plot of the missing values in airquality.
vis_miss(airquality)

library(ggplot2)
# Scatter plot of Ozone against Solar.R. geom_point() drops rows where either
# value is missing and reports that in a warning.
ggplot(data = airquality, mapping = aes(x = Solar.R, y = Ozone)) +
  geom_point()
## Warning: Removed 42 rows containing missing values or values outside the scale range
## (`geom_point()`).

# NOTE(review): this chunk repeats the previous scatter plot verbatim and
# produces the same warning; confirm whether the duplication is intentional.
ggplot(airquality,
       aes(x= Solar.R,
           y= Ozone))+
  geom_point()
## Warning: Removed 42 rows containing missing values or values outside the scale range
## (`geom_point()`).

library(naniar)
## Warning: package 'naniar' was built under R version 4.5.1
# Same scatter plot, with naniar's geom_miss_point() in place of geom_point().
ggplot(data = airquality, mapping = aes(x = Solar.R, y = Ozone)) +
  geom_miss_point()

# Split the plot into one panel per Month.
ggplot(data = airquality, mapping = aes(x = Solar.R, y = Ozone)) +
  geom_miss_point() +
  facet_wrap(~Month)

# Faceted plot again, restyled with ggplot2's dark theme.
ggplot(data = airquality, mapping = aes(x = Solar.R, y = Ozone)) +
  geom_miss_point() +
  facet_wrap(~Month) +
  theme_dark()

# Plot the missing values in airquality by variable.
gg_miss_var(airquality)

# The result is a ggplot object, so a theme can be added with `+`.
gg_miss_var(airquality) + theme_bw()

# Override the y-axis label.
gg_miss_var(airquality) + labs(y= "look at all the missing ones")

# Facet the plot by Month.
gg_miss_var(airquality, facet = Month)

# Shadow matrix: one <name>_NA factor column per variable, marking every cell
# as NA (missing) or !NA (observed).
as_shadow(airquality)
## # A tibble: 153 × 6
##    Ozone_NA Solar.R_NA Wind_NA Temp_NA Month_NA Day_NA
##    <fct>    <fct>      <fct>   <fct>   <fct>    <fct> 
##  1 !NA      !NA        !NA     !NA     !NA      !NA   
##  2 !NA      !NA        !NA     !NA     !NA      !NA   
##  3 !NA      !NA        !NA     !NA     !NA      !NA   
##  4 !NA      !NA        !NA     !NA     !NA      !NA   
##  5 NA       NA         !NA     !NA     !NA      !NA   
##  6 !NA      NA         !NA     !NA     !NA      !NA   
##  7 !NA      !NA        !NA     !NA     !NA      !NA   
##  8 !NA      !NA        !NA     !NA     !NA      !NA   
##  9 !NA      !NA        !NA     !NA     !NA      !NA   
## 10 NA       !NA        !NA     !NA     !NA      !NA   
## # ℹ 143 more rows
# Build the data with its <name>_NA shadow columns attached, once with
# bind_shadow() and once with nabular(), so the two results can be compared.
aq_shadow <- bind_shadow(airquality)
aq_nab <- nabular(airquality)

library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
glimpse(aq_shadow)
## Rows: 153
## Columns: 12
## $ Ozone      <int> 41, 36, 12, 18, NA, 28, 23, 19, 8, NA, 7, 16, 11, 14, 18, 1…
## $ Solar.R    <int> 190, 118, 149, 313, NA, NA, 299, 99, 19, 194, NA, 256, 290,…
## $ Wind       <dbl> 7.4, 8.0, 12.6, 11.5, 14.3, 14.9, 8.6, 13.8, 20.1, 8.6, 6.9…
## $ Temp       <int> 67, 72, 74, 62, 56, 66, 65, 59, 61, 69, 74, 69, 66, 68, 58,…
## $ Month      <int> 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,…
## $ Day        <int> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, …
## $ Ozone_NA   <fct> !NA, !NA, !NA, !NA, NA, !NA, !NA, !NA, !NA, NA, !NA, !NA, !…
## $ Solar.R_NA <fct> !NA, !NA, !NA, !NA, NA, NA, !NA, !NA, !NA, !NA, NA, !NA, !N…
## $ Wind_NA    <fct> !NA, !NA, !NA, !NA, !NA, !NA, !NA, !NA, !NA, !NA, !NA, !NA,…
## $ Temp_NA    <fct> !NA, !NA, !NA, !NA, !NA, !NA, !NA, !NA, !NA, !NA, !NA, !NA,…
## $ Month_NA   <fct> !NA, !NA, !NA, !NA, !NA, !NA, !NA, !NA, !NA, !NA, !NA, !NA,…
## $ Day_NA     <fct> !NA, !NA, !NA, !NA, !NA, !NA, !NA, !NA, !NA, !NA, !NA, !NA,…
glimpse(aq_nab)
## Rows: 153
## Columns: 12
## $ Ozone      <int> 41, 36, 12, 18, NA, 28, 23, 19, 8, NA, 7, 16, 11, 14, 18, 1…
## $ Solar.R    <int> 190, 118, 149, 313, NA, NA, 299, 99, 19, 194, NA, 256, 290,…
## $ Wind       <dbl> 7.4, 8.0, 12.6, 11.5, 14.3, 14.9, 8.6, 13.8, 20.1, 8.6, 6.9…
## $ Temp       <int> 67, 72, 74, 62, 56, 66, 65, 59, 61, 69, 74, 69, 66, 68, 58,…
## $ Month      <int> 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,…
## $ Day        <int> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, …
## $ Ozone_NA   <fct> !NA, !NA, !NA, !NA, NA, !NA, !NA, !NA, !NA, NA, !NA, !NA, !…
## $ Solar.R_NA <fct> !NA, !NA, !NA, !NA, NA, NA, !NA, !NA, !NA, !NA, NA, !NA, !N…
## $ Wind_NA    <fct> !NA, !NA, !NA, !NA, !NA, !NA, !NA, !NA, !NA, !NA, !NA, !NA,…
## $ Temp_NA    <fct> !NA, !NA, !NA, !NA, !NA, !NA, !NA, !NA, !NA, !NA, !NA, !NA,…
## $ Month_NA   <fct> !NA, !NA, !NA, !NA, !NA, !NA, !NA, !NA, !NA, !NA, !NA, !NA,…
## $ Day_NA     <fct> !NA, !NA, !NA, !NA, !NA, !NA, !NA, !NA, !NA, !NA, !NA, !NA,…
all.equal(aq_shadow, aq_nab)
## [1] TRUE
# Summary statistics of Solar.R, split by whether Ozone is missing.
# summarise_at() is superseded; summarise(across()) is the current dplyr
# idiom. `.names = "{.fn}"` keeps the original output column names
# (mean, sd, var, min, max) instead of the default Solar.R_<fn>.
airquality %>%
  bind_shadow() %>%
  group_by(Ozone_NA) %>%
  summarise(
    across(
      Solar.R,
      list(
        mean = function(x) mean(x, na.rm = TRUE),
        sd = function(x) sd(x, na.rm = TRUE),
        var = function(x) var(x, na.rm = TRUE),
        min = function(x) min(x, na.rm = TRUE),
        max = function(x) max(x, na.rm = TRUE)
      ),
      .names = "{.fn}"
    )
  )
## # A tibble: 2 × 6
##   Ozone_NA  mean    sd   var   min   max
##   <fct>    <dbl> <dbl> <dbl> <int> <int>
## 1 !NA       185.  91.2 8309.     7   334
## 2 NA        190.  87.7 7690.    31   332
# Density of Temp, with one curve per level of the Ozone_NA shadow column.
ggplot(data = aq_shadow, mapping = aes(x = Temp, colour = Ozone_NA)) +
  geom_density()

# Histogram of air temperature, filled by the humidity_NA shadow column.
oceanbuoys %>%
  bind_shadow() %>%
  ggplot(mapping = aes(x = air_temp_c, fill = humidity_NA)) +
  geom_histogram()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 81 rows containing non-finite outside the scale range
## (`stat_bin()`).

# Histogram of humidity, filled by the air_temp_c_NA shadow column.
oceanbuoys %>%
  bind_shadow() %>%
  ggplot(mapping = aes(x = humidity, fill = air_temp_c_NA)) +
  geom_histogram()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 93 rows containing non-finite outside the scale range
## (`stat_bin()`).

library (simputation)
## Warning: package 'simputation' was built under R version 4.5.1
## 
## Attaching package: 'simputation'
## The following object is masked from 'package:naniar':
## 
##     impute_median
# NOTE(review): dplyr is already attached earlier in this document, so this
# second library() call is redundant.
library(dplyr)

# Impute Ozone with impute_lm() using the formula Ozone ~ Temp + Wind, then
# plot the result against Temp.
airquality %>%
  impute_lm(Ozone ~ Temp + Wind) %>%
  ggplot(mapping = aes(x = Temp, y = Ozone)) +
  geom_point()

# Same imputation, run on the shadow-augmented data so the points can be
# coloured by the Ozone_NA shadow column. The data is converted to a plain
# data.frame before imputing.
aq_shadow %>%
  as.data.frame() %>%
  impute_lm(Ozone ~ Temp + Wind) %>%
  ggplot(mapping = aes(x = Temp, y = Ozone, colour = Ozone_NA)) +
  geom_point()

# Number of distinct rows in the data frame.
dplyr::n_distinct(airquality)
## [1] 153
# Number of distinct values in the Ozone column.
dplyr::n_distinct(airquality$Ozone)
## [1] 68
# Total number of missing cells across the whole data frame.
n_miss(airquality)
## [1] 44
# Number of missing values in Ozone alone.
n_miss(airquality$Ozone)
## [1] 37
# Total number of observed (non-missing) cells.
n_complete(airquality)
## [1] 874
# Number of observed values in Ozone alone.
n_complete(airquality$Ozone)
## [1] 116
# Share of rows (cases) that contain at least one missing value, as a proportion.
prop_miss_case(airquality)
## [1] 0.2745098
# The same share expressed as a percentage.
pct_miss_case(airquality)
# Per-row missingness: number and percentage of missing values in each case.
miss_case_summary(airquality)
## # A tibble: 153 × 3
##     case n_miss pct_miss
##    <int>  <int>    <dbl>
##  1     5      2     33.3
##  2    27      2     33.3
##  3     6      1     16.7
##  4    10      1     16.7
##  5    11      1     16.7
##  6    25      1     16.7
##  7    26      1     16.7
##  8    32      1     16.7
##  9    33      1     16.7
## 10    34      1     16.7
## # ℹ 143 more rows
# Tabulate how many rows have 0, 1, 2, ... missing values.
miss_case_table(airquality)
## # A tibble: 3 × 3
##   n_miss_in_case n_cases pct_cases
##            <int>   <int>     <dbl>
## 1              0     111     72.5 
## 2              1      40     26.1 
## 3              2       2      1.31
# Share of variables (columns) that contain at least one missing value, as a proportion.
prop_miss_var(airquality)
## [1] 0.3333333
# The same share expressed as a percentage.
pct_miss_var(airquality)
## [1] 33.33333
# Per-variable missingness: number and percentage of missing values in each column.
miss_var_summary(airquality)
## # A tibble: 6 × 3
##   variable n_miss pct_miss
##   <chr>     <int>    <num>
## 1 Ozone        37    24.2 
## 2 Solar.R       7     4.58
## 3 Wind          0     0   
## 4 Temp          0     0   
## 5 Month         0     0   
## 6 Day           0     0
# Tabulate how many variables share each missing-value count.
miss_var_table(airquality)
## # A tibble: 3 × 3
##   n_miss_in_var n_vars pct_vars
##           <int>  <int>    <dbl>
## 1             0      4     66.7
## 2             7      1     16.7
## 3            37      1     16.7
# Lengths of the alternating runs of complete and missing values in hourly_counts.
miss_var_run(pedestrian, hourly_counts)
## # A tibble: 35 × 2
##    run_length is_na   
##         <int> <chr>   
##  1       6628 complete
##  2          1 missing 
##  3       5250 complete
##  4        624 missing 
##  5       3652 complete
##  6          1 missing 
##  7       1290 complete
##  8        744 missing 
##  9       7420 complete
## 10          1 missing 
## # ℹ 25 more rows
# Missingness of hourly_counts summarised over consecutive spans of 100 rows.
miss_var_span(pedestrian, hourly_counts, span_every = 100)
## # A tibble: 377 × 6
##    span_counter n_miss n_complete prop_miss prop_complete n_in_span
##           <int>  <int>      <int>     <dbl>         <dbl>     <int>
##  1            1      0        100         0             1       100
##  2            2      0        100         0             1       100
##  3            3      0        100         0             1       100
##  4            4      0        100         0             1       100
##  5            5      0        100         0             1       100
##  6            6      0        100         0             1       100
##  7            7      0        100         0             1       100
##  8            8      0        100         0             1       100
##  9            9      0        100         0             1       100
## 10           10      0        100         0             1       100
## # ℹ 367 more rows
# Per-variable missingness for the pedestrian data.
pedestrian %>% miss_var_summary()
## # A tibble: 9 × 3
##   variable      n_miss pct_miss
##   <chr>          <int>    <num>
## 1 hourly_counts   2548     6.76
## 2 date_time          0     0   
## 3 year               0     0   
## 4 month              0     0   
## 5 month_day          0     0   
## 6 week_day           0     0   
## 7 hour               0     0   
## 8 sensor_id          0     0   
## 9 sensor_name        0     0
# Per-variable missingness computed separately for each month, keeping only
# the rows for hourly_counts.
pedestrian %>%
  group_by(month) %>%
  miss_var_summary() %>%
  filter(variable == "hourly_counts")
## # A tibble: 12 × 4
## # Groups:   month [12]
##    month     variable      n_miss pct_miss
##    <ord>     <chr>          <int>    <num>
##  1 January   hourly_counts      0     0   
##  2 February  hourly_counts      0     0   
##  3 March     hourly_counts      0     0   
##  4 April     hourly_counts    552    19.2 
##  5 May       hourly_counts     72     2.42
##  6 June      hourly_counts      0     0   
##  7 July      hourly_counts      0     0   
##  8 August    hourly_counts    408    13.7 
##  9 September hourly_counts      0     0   
## 10 October   hourly_counts    412     7.44
## 11 November  hourly_counts    888    30.8 
## 12 December  hourly_counts    216     7.26
# Append a prop_miss_all column holding each row's proportion of missing
# values, and show the first rows.
head(add_prop_miss(airquality))
##   Ozone Solar.R Wind Temp Month Day prop_miss_all
## 1    41     190  7.4   67     5   1     0.0000000
## 2    36     118  8.0   72     5   2     0.0000000
## 3    12     149 12.6   74     5   3     0.0000000
## 4    18     313 11.5   62     5   4     0.0000000
## 5    NA      NA 14.3   56     5   5     0.3333333
## 6    28      NA 14.9   66     5   6     0.1666667
library(rpart)
library(rpart.plot)
## Warning: package 'rpart.plot' was built under R version 4.5.1
# Fit a regression tree that predicts each row's proportion of missing values
# (prop_miss_all) from the other columns, then plot it.
# model = TRUE stores the model frame inside the fit. Without it, prp() must
# re-evaluate the recorded `data = .` argument, which no longer exists outside
# the pipe, and it warns "Cannot retrieve the data used to build the model".
airquality %>%
  add_prop_miss() %>%
  rpart(prop_miss_all ~ ., data = ., model = TRUE) %>%
  prp(type = 4, extra = 101, prefix = "prop.Miss = ")