Looking at The Data
A Few Key Verbs
- filter
- select
Histogram of Last Night’s Sleep

## Chunk and console settings for the rendered document.
## width = 70 limits printed output to 70 characters per line.
options(width = 70)
## comment = NA prints chunk results without a leading comment prefix.
knitr::opts_chunk$set(comment = NA)

## add additional libraries/packages here, as needed
## leaving the tidyverse as the last package loaded
## NOTE(review): magrittr and janitor are attached further down in this
## document, after the tidyverse, so this ordering does not hold for
## the document as a whole.
library(tidyverse)

Loading in the data …

## Read surveyday1_2020.csv (a path relative to the working directory)
## with read_csv() and store the resulting tibble as day1, the object
## used throughout the rest of this document.

day1 <- read_csv("surveyday1_2020.csv")

Parsed with column specification:
cols(
  .default = col_double(),
  sex = col_character(),
  glasses = col_character(),
  english = col_character(),
  favcolor = col_character()
)

See spec(...) for full column specifications.

Looking at The Data

day1

# A tibble: 382 x 21
   student sex   glasses english statsofar ageguess smoke h.left
     <dbl> <chr> <chr>   <chr>       <dbl>    <dbl> <dbl>  <dbl>
 1  202001 <NA>  y       n              NA       NA     1     NA
 2  202002 <NA>  y       y              NA       NA     1     NA
 3  202003 <NA>  y       y              NA       NA     1     NA
 4  202004 <NA>  y       n              NA       NA     1     NA
 5  202005 <NA>  y       y              NA       NA     1     NA
 6  202006 <NA>  n       y              NA       NA     1     NA
 7  202007 <NA>  y       y              NA       NA     1     NA
 8  202008 <NA>  n       y              NA       NA     1     NA
 9  202009 <NA>  y       n              NA       NA     1     NA
10  202010 <NA>  y       n              NA       NA     1     NA
# ... with 372 more rows, and 13 more variables: h.right <dbl>,
#   handedness <dbl>, statfuture <dbl>, haircut <dbl>, lecture <dbl>,
#   alone <dbl>, height.in <dbl>, hand.span <dbl>, favcolor <chr>,
#   lastsleep <dbl>, pulse <dbl>, year <dbl>, lovetrueage <dbl>

Variable Names

names(day1)

 [1] "student"     "sex"         "glasses"     "english"    
 [5] "statsofar"   "ageguess"    "smoke"       "h.left"     
 [9] "h.right"     "handedness"  "statfuture"  "haircut"    
[13] "lecture"     "alone"       "height.in"   "hand.span"  
[17] "favcolor"    "lastsleep"   "pulse"       "year"       
[21] "lovetrueage"

Summary of the variables

summary(day1)

    student           sex              glasses         
 Min.   :201401   Length:382         Length:382        
 1st Qu.:201605   Class :character   Class :character  
 Median :201737   Mode  :character   Mode  :character  
 Mean   :201751                                        
 3rd Qu.:201933                                        
 Max.   :202067                                        
                                                       
   english            statsofar        ageguess        smoke      
 Length:382         Min.   :1.000   Min.   :21.0   Min.   :1.000  
 Class :character   1st Qu.:4.500   1st Qu.:45.0   1st Qu.:1.000  
 Mode  :character   Median :5.000   Median :48.0   Median :1.000  
                    Mean   :5.073   Mean   :47.3   Mean   :1.068  
                    3rd Qu.:6.000   3rd Qu.:52.0   3rd Qu.:1.000  
                    Max.   :7.000   Max.   :70.0   Max.   :3.000  
                    NA's   :67      NA's   :73     NA's   :2      
     h.left          h.right     handedness        statfuture   
 Min.   : 0.000   Min.   : 0   Min.   :-1.0000   Min.   :3.000  
 1st Qu.: 0.000   1st Qu.:10   1st Qu.: 0.5000   1st Qu.:6.000  
 Median : 2.000   Median :14   Median : 0.8000   Median :7.000  
 Mean   : 3.297   Mean   :13   Mean   : 0.6208   Mean   :6.368  
 3rd Qu.: 4.000   3rd Qu.:17   3rd Qu.: 1.0000   3rd Qu.:7.000  
 Max.   :20.000   Max.   :20   Max.   : 1.0000   Max.   :7.000  
 NA's   :69       NA's   :69   NA's   :69        NA's   :2      
    haircut          lecture          alone         height.in    
 Min.   :  0.00   Min.   :1.000   Min.   :1.000   Min.   :57.00  
 1st Qu.: 12.00   1st Qu.:2.000   1st Qu.:2.000   1st Qu.:64.00  
 Median : 20.00   Median :3.000   Median :3.000   Median :67.00  
 Mean   : 27.28   Mean   :2.892   Mean   :2.976   Mean   :67.12  
 3rd Qu.: 35.00   3rd Qu.:4.000   3rd Qu.:4.000   3rd Qu.:70.00  
 Max.   :210.00   Max.   :5.000   Max.   :5.000   Max.   :77.50  
 NA's   :6        NA's   :2       NA's   :3       NA's   :4      
   hand.span       favcolor           lastsleep          pulse       
 Min.   : 8.00   Length:382         Min.   : 2.000   Min.   : 30.00  
 1st Qu.:19.00   Class :character   1st Qu.: 6.000   1st Qu.: 64.00  
 Median :20.00   Mode  :character   Median : 7.000   Median : 72.00  
 Mean   :19.94                      Mean   : 6.907   Mean   : 72.96  
 3rd Qu.:21.70                      3rd Qu.: 8.000   3rd Qu.: 80.00  
 Max.   :27.00                      Max.   :12.000   Max.   :110.00  
 NA's   :72                         NA's   :3        NA's   :69      
      year       lovetrueage   
 Min.   :2014   Min.   :47.50  
 1st Qu.:2016   1st Qu.:49.50  
 Median :2017   Median :50.50  
 Mean   :2017   Mean   :50.73  
 3rd Qu.:2019   3rd Qu.:52.50  
 Max.   :2020   Max.   :53.50

Table of glasses and english

table(day1$glasses, day1$english)

Table of english and sex

table(day1$english, day1$sex)

## magrittr is attached for the %$% pipe used in the table() call below.
library(magrittr)


Attaching package: 'magrittr'

The following object is masked from 'package:purrr':

    set_names

The following object is masked from 'package:tidyr':

    extract

## %$% exposes the columns of day1 to table(), so the printed table is
## labelled with the variable names. table() leaves out missing values
## by default; the tabyl() call further down shows them in an NA_ column.
day1 %$% table(glasses, english)

       english
glasses  n  y
      n 10 27
      y 16 74

## janitor is attached for tabyl(), used below to cross-tabulate
## glasses and english with missing values included.
library(janitor)


Attaching package: 'janitor'

The following objects are masked from 'package:stats':

    chisq.test, fisher.test

day1 %>% tabyl(glasses, english)

 glasses  n   y NA_
       n 10  27   0
       y 16  74   1
    <NA> 47 205   2

A Few Key Verbs

filter

filter picks out rows in the data frame

day1

# A tibble: 382 x 21
   student sex   glasses english statsofar ageguess smoke h.left
     <dbl> <chr> <chr>   <chr>       <dbl>    <dbl> <dbl>  <dbl>
 1  202001 <NA>  y       n              NA       NA     1     NA
 2  202002 <NA>  y       y              NA       NA     1     NA
 3  202003 <NA>  y       y              NA       NA     1     NA
 4  202004 <NA>  y       n              NA       NA     1     NA
 5  202005 <NA>  y       y              NA       NA     1     NA
 6  202006 <NA>  n       y              NA       NA     1     NA
 7  202007 <NA>  y       y              NA       NA     1     NA
 8  202008 <NA>  n       y              NA       NA     1     NA
 9  202009 <NA>  y       n              NA       NA     1     NA
10  202010 <NA>  y       n              NA       NA     1     NA
# ... with 372 more rows, and 13 more variables: h.right <dbl>,
#   handedness <dbl>, statfuture <dbl>, haircut <dbl>, lecture <dbl>,
#   alone <dbl>, height.in <dbl>, hand.span <dbl>, favcolor <chr>,
#   lastsleep <dbl>, pulse <dbl>, year <dbl>, lovetrueage <dbl>

## Keep only the rows where glasses is "y". Rows where glasses is
## missing are dropped as well, because filter() keeps a row only
## when the condition is TRUE.
day1 %>%
  filter(glasses == "y")

# A tibble: 91 x 21
   student sex   glasses english statsofar ageguess smoke h.left
     <dbl> <chr> <chr>   <chr>       <dbl>    <dbl> <dbl>  <dbl>
 1  202001 <NA>  y       n              NA       NA     1     NA
 2  202002 <NA>  y       y              NA       NA     1     NA
 3  202003 <NA>  y       y              NA       NA     1     NA
 4  202004 <NA>  y       n              NA       NA     1     NA
 5  202005 <NA>  y       y              NA       NA     1     NA
 6  202007 <NA>  y       y              NA       NA     1     NA
 7  202009 <NA>  y       n              NA       NA     1     NA
 8  202010 <NA>  y       n              NA       NA     1     NA
 9  202011 <NA>  y       y              NA       NA     1     NA
10  202012 <NA>  y       y              NA       NA     1     NA
# ... with 81 more rows, and 13 more variables: h.right <dbl>,
#   handedness <dbl>, statfuture <dbl>, haircut <dbl>, lecture <dbl>,
#   alone <dbl>, height.in <dbl>, hand.span <dbl>, favcolor <chr>,
#   lastsleep <dbl>, pulse <dbl>, year <dbl>, lovetrueage <dbl>

## Conditions separated by commas must all hold, so this keeps the
## rows with glasses == "y" and english == "n".
day1 %>%
  filter(glasses == "y", english == "n")

# A tibble: 16 x 21
   student sex   glasses english statsofar ageguess smoke h.left
     <dbl> <chr> <chr>   <chr>       <dbl>    <dbl> <dbl>  <dbl>
 1  202001 <NA>  y       n              NA       NA     1     NA
 2  202004 <NA>  y       n              NA       NA     1     NA
 3  202009 <NA>  y       n              NA       NA     1     NA
 4  202010 <NA>  y       n              NA       NA     1     NA
 5  202029 <NA>  y       n              NA       NA     1     NA
 6  202031 <NA>  y       n              NA       NA     1     NA
 7  202035 <NA>  y       n              NA       NA     1     NA
 8  202054 <NA>  y       n              NA       NA     1     NA
 9  202058 <NA>  y       n              NA       NA     1     NA
10  202066 <NA>  y       n              NA       NA     1     NA
11  201918 <NA>  y       n               4       42     1      2
12  201919 <NA>  y       n               7       50     1      2
13  201931 <NA>  y       n               4       45     2      3
14  201932 <NA>  y       n               6       40     1      0
15  201951 <NA>  y       n               3       45     1      0
16  201954 <NA>  y       n               5       55     1      0
# ... with 13 more variables: h.right <dbl>, handedness <dbl>,
#   statfuture <dbl>, haircut <dbl>, lecture <dbl>, alone <dbl>,
#   height.in <dbl>, hand.span <dbl>, favcolor <chr>,
#   lastsleep <dbl>, pulse <dbl>, year <dbl>, lovetrueage <dbl>

## Number of rows with glasses == "y" and favcolor == "blue".
## count() with no arguments returns that total as a one-row tibble.
day1 %>%
  filter(glasses == "y", favcolor == "blue") %>%
  count()

# A tibble: 1 x 1
      n
  <int>
1    27

## Rows with glasses == "y" and favcolor == "blue", counted separately
## for each value of year that remains after filtering.
day1 %>%
  filter(glasses == "y", favcolor == "blue") %>%
  count(year)

# A tibble: 2 x 2
   year     n
  <dbl> <int>
1  2019    13
2  2020    14

## Rows with english == "y" and favcolor == "blue", counted by year.
day1 %>%
  filter(english == "y", favcolor == "blue") %>%
  count(year)

# A tibble: 7 x 2
   year     n
  <dbl> <int>
1  2014    12
2  2015    16
3  2016    23
4  2017    14
5  2018    14
6  2019    18
7  2020    17

select

select is used to pick out columns (variables) that we want to use

## Keep just these three columns, in the order listed.
day1 %>%
  select(smoke, favcolor, lastsleep)

# A tibble: 382 x 3
   smoke favcolor lastsleep
   <dbl> <chr>        <dbl>
 1     1 blue           7  
 2     1 blue           6.5
 3     1 purple         8  
 4     1 blue           7  
 5     1 purple         7  
 6     1 silver         8  
 7     1 green          6  
 8     1 blue           5  
 9     1 purple         7  
10     1 green          6.5
# ... with 372 more rows

Three key verbs so far: filter, select, count

More to come.

Histogram of Last Night’s Sleep

I build plots using the ggplot2 package, which is part of the tidyverse. ggplot2 has a function called ggplot().

## First attempt at a histogram of lastsleep. No binwidth is given, so
## geom_histogram() uses its default of 30 bins and says so in a message.
ggplot(day1, aes(x = lastsleep)) +
  geom_histogram()

`stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

Warning: Removed 3 rows containing non-finite values (stat_bin).

Improve the Histogram

## binwidth = 1 makes each bar cover one unit of lastsleep; fill sets
## the bar colour and colour sets the bar outline.
ggplot(day1, aes(x = lastsleep)) +
  geom_histogram(
    binwidth = 1,
    fill = "royalblue",
    colour = "yellow"
  ) +
  labs(title = "Histogram of Sleep for 431 Students")

Warning: Removed 3 rows containing non-finite values (stat_bin).

Boxplot of Sleep by English

## One box of lastsleep per value of english. Rows with a missing
## lastsleep are removed by ggplot2 with a warning.
ggplot(day1, aes(x = english, y = lastsleep)) +
  geom_boxplot()

Warning: Removed 3 rows containing non-finite values (stat_boxplot).

Get rid of the missing values…

## Drop the rows where english or lastsleep is missing before plotting,
## then pass the filtered data to ggplot() as its first argument.
day1 %>%
  filter(!is.na(english), !is.na(lastsleep)) %>%
  ggplot(aes(x = english, y = lastsleep)) +
  geom_boxplot()

Faceted Histogram of Sleep by English

## Histogram of lastsleep drawn in a separate panel for each
## value of english.
ggplot(day1, aes(x = lastsleep)) +
  geom_histogram(binwidth = 1) +
  facet_wrap(~ english)

Warning: Removed 3 rows containing non-finite values (stat_bin).

Faceted Histogram of Sleep by Year

## Histogram of lastsleep drawn in a separate panel for each year.
ggplot(day1, aes(x = lastsleep)) +
  geom_histogram(binwidth = 1) +
  facet_wrap(~ year)

Warning: Removed 3 rows containing non-finite values (stat_bin).

Boxplot of Sleep by Year

## year is stored as a number, so it is wrapped in factor() to get
## one box per year on the x axis.
ggplot(day1, aes(x = factor(year), y = lastsleep)) +
  geom_boxplot()

Warning: Removed 3 rows containing non-finite values (stat_boxplot).

Scatterplot of Sleep by Height

## Scatterplot of lastsleep against height.in. Rows missing either
## value are left out by ggplot2 with a warning.
ggplot(day1, aes(x = height.in, y = lastsleep)) +
  geom_point()

Warning: Removed 5 rows containing missing values (geom_point).

Add a linear smooth and jitter the points a little

## geom_jitter() adds a little random noise to the points so that
## overlapping observations can be told apart; shape 1 is an open circle.
## geom_smooth(method = "lm") overlays a straight-line fit.
ggplot(day1, aes(x = height.in, y = lastsleep)) +
  geom_jitter(shape = 1) +
  geom_smooth(method = "lm") +
  labs(
    title = "Sleep and Height in 431 students",
    subtitle = "jittered points, with Linear fit"
  )

`geom_smooth()` using formula 'y ~ x'

Warning: Removed 5 rows containing non-finite values (stat_smooth).

Warning: Removed 5 rows containing missing values (geom_point).

Add a loess smooth

## Scatterplot with open-circle points and a loess smooth; labs()
## replaces the default x-axis label.
ggplot(day1, aes(x = height.in, y = lastsleep)) +
  geom_point(shape = 1) +
  geom_smooth(method = "loess") +
  labs(x = "New title for X axis")

`geom_smooth()` using formula 'y ~ x'

Warning: Removed 5 rows containing non-finite values (stat_smooth).

Warning: Removed 5 rows containing missing values (geom_point).

Live Demo for Class 03

Thomas E. Love

2020-09-01