# print chunk output without knitr's default "##" prefix
knitr::opts_chunk$set(comment = NA)
# wrap printed output at 70 characters
options(width = 70)
## add additional libraries/packages here, as needed
## leaving the tidyverse as the last package loaded
library(tidyverse)
Loading in the data …
## read surveyday1_2020.csv (looked up relative to the working directory)
## and store the result as a tibble called day1
day1 <- read_csv("surveyday1_2020.csv")
Parsed with column specification:
cols(
.default = col_double(),
sex = col_character(),
glasses = col_character(),
english = col_character(),
favcolor = col_character()
)
See spec(...) for full column specifications.
# print the tibble: only the first 10 rows and the columns that fit the width are shown
day1
# A tibble: 382 x 21
student sex glasses english statsofar ageguess smoke h.left
<dbl> <chr> <chr> <chr> <dbl> <dbl> <dbl> <dbl>
1 202001 <NA> y n NA NA 1 NA
2 202002 <NA> y y NA NA 1 NA
3 202003 <NA> y y NA NA 1 NA
4 202004 <NA> y n NA NA 1 NA
5 202005 <NA> y y NA NA 1 NA
6 202006 <NA> n y NA NA 1 NA
7 202007 <NA> y y NA NA 1 NA
8 202008 <NA> n y NA NA 1 NA
9 202009 <NA> y n NA NA 1 NA
10 202010 <NA> y n NA NA 1 NA
# ... with 372 more rows, and 13 more variables: h.right <dbl>,
# handedness <dbl>, statfuture <dbl>, haircut <dbl>, lecture <dbl>,
# alone <dbl>, height.in <dbl>, hand.span <dbl>, favcolor <chr>,
# lastsleep <dbl>, pulse <dbl>, year <dbl>, lovetrueage <dbl>
# list the names of all 21 columns
names(day1)
[1] "student" "sex" "glasses" "english"
[5] "statsofar" "ageguess" "smoke" "h.left"
[9] "h.right" "handedness" "statfuture" "haircut"
[13] "lecture" "alone" "height.in" "hand.span"
[17] "favcolor" "lastsleep" "pulse" "year"
[21] "lovetrueage"
# column-by-column summary: quartiles, mean and NA count for numeric columns,
# length/class/mode for character columns
summary(day1)
student sex glasses
Min. :201401 Length:382 Length:382
1st Qu.:201605 Class :character Class :character
Median :201737 Mode :character Mode :character
Mean :201751
3rd Qu.:201933
Max. :202067
english statsofar ageguess smoke
Length:382 Min. :1.000 Min. :21.0 Min. :1.000
Class :character 1st Qu.:4.500 1st Qu.:45.0 1st Qu.:1.000
Mode :character Median :5.000 Median :48.0 Median :1.000
Mean :5.073 Mean :47.3 Mean :1.068
3rd Qu.:6.000 3rd Qu.:52.0 3rd Qu.:1.000
Max. :7.000 Max. :70.0 Max. :3.000
NA's :67 NA's :73 NA's :2
h.left h.right handedness statfuture
Min. : 0.000 Min. : 0 Min. :-1.0000 Min. :3.000
1st Qu.: 0.000 1st Qu.:10 1st Qu.: 0.5000 1st Qu.:6.000
Median : 2.000 Median :14 Median : 0.8000 Median :7.000
Mean : 3.297 Mean :13 Mean : 0.6208 Mean :6.368
3rd Qu.: 4.000 3rd Qu.:17 3rd Qu.: 1.0000 3rd Qu.:7.000
Max. :20.000 Max. :20 Max. : 1.0000 Max. :7.000
NA's :69 NA's :69 NA's :69 NA's :2
haircut lecture alone height.in
Min. : 0.00 Min. :1.000 Min. :1.000 Min. :57.00
1st Qu.: 12.00 1st Qu.:2.000 1st Qu.:2.000 1st Qu.:64.00
Median : 20.00 Median :3.000 Median :3.000 Median :67.00
Mean : 27.28 Mean :2.892 Mean :2.976 Mean :67.12
3rd Qu.: 35.00 3rd Qu.:4.000 3rd Qu.:4.000 3rd Qu.:70.00
Max. :210.00 Max. :5.000 Max. :5.000 Max. :77.50
NA's :6 NA's :2 NA's :3 NA's :4
hand.span favcolor lastsleep pulse
Min. : 8.00 Length:382 Min. : 2.000 Min. : 30.00
1st Qu.:19.00 Class :character 1st Qu.: 6.000 1st Qu.: 64.00
Median :20.00 Mode :character Median : 7.000 Median : 72.00
Mean :19.94 Mean : 6.907 Mean : 72.96
3rd Qu.:21.70 3rd Qu.: 8.000 3rd Qu.: 80.00
Max. :27.00 Max. :12.000 Max. :110.00
NA's :72 NA's :3 NA's :69
year lovetrueage
Min. :2014 Min. :47.50
1st Qu.:2016 1st Qu.:49.50
Median :2017 Median :50.50
Mean :2017 Mean :50.73
3rd Qu.:2019 3rd Qu.:52.50
Max. :2020 Max. :53.50
# cross-tabulate glasses (rows) against english (columns); missing values are not counted
table(day1[["glasses"]], day1[["english"]])
n y
n 10 27
y 16 74
# cross-tabulate english (rows) against sex (columns); missing values are not counted
table(day1[["english"]], day1[["sex"]])
f m
n 21 26
y 102 103
library(magrittr)
Attaching package: 'magrittr'
The following object is masked from 'package:purrr':
set_names
The following object is masked from 'package:tidyr':
extract
# %$% (from magrittr) makes the columns of day1 available by name,
# so table() can label the rows and columns with the variable names
day1 %$% table(glasses, english)
english
glasses n y
n 10 27
y 16 74
library(janitor)
Attaching package: 'janitor'
The following objects are masked from 'package:stats':
chisq.test, fisher.test
# tabyl() (from janitor) builds the same cross-tabulation but also counts
# missing values: see the NA_ column and the <NA> row
day1 %>% tabyl(glasses, english)
glasses n y NA_
n 10 27 0
y 16 74 1
<NA> 47 205 2
filter picks out rows in the data frame
# the full 382-row tibble, for comparison with the filtered results that follow
day1
# A tibble: 382 x 21
student sex glasses english statsofar ageguess smoke h.left
<dbl> <chr> <chr> <chr> <dbl> <dbl> <dbl> <dbl>
1 202001 <NA> y n NA NA 1 NA
2 202002 <NA> y y NA NA 1 NA
3 202003 <NA> y y NA NA 1 NA
4 202004 <NA> y n NA NA 1 NA
5 202005 <NA> y y NA NA 1 NA
6 202006 <NA> n y NA NA 1 NA
7 202007 <NA> y y NA NA 1 NA
8 202008 <NA> n y NA NA 1 NA
9 202009 <NA> y n NA NA 1 NA
10 202010 <NA> y n NA NA 1 NA
# ... with 372 more rows, and 13 more variables: h.right <dbl>,
# handedness <dbl>, statfuture <dbl>, haircut <dbl>, lecture <dbl>,
# alone <dbl>, height.in <dbl>, hand.span <dbl>, favcolor <chr>,
# lastsleep <dbl>, pulse <dbl>, year <dbl>, lovetrueage <dbl>
# keep only the rows where glasses is "y"; rows where glasses is missing are dropped too
day1 %>% filter(glasses == "y")
# A tibble: 91 x 21
student sex glasses english statsofar ageguess smoke h.left
<dbl> <chr> <chr> <chr> <dbl> <dbl> <dbl> <dbl>
1 202001 <NA> y n NA NA 1 NA
2 202002 <NA> y y NA NA 1 NA
3 202003 <NA> y y NA NA 1 NA
4 202004 <NA> y n NA NA 1 NA
5 202005 <NA> y y NA NA 1 NA
6 202007 <NA> y y NA NA 1 NA
7 202009 <NA> y n NA NA 1 NA
8 202010 <NA> y n NA NA 1 NA
9 202011 <NA> y y NA NA 1 NA
10 202012 <NA> y y NA NA 1 NA
# ... with 81 more rows, and 13 more variables: h.right <dbl>,
# handedness <dbl>, statfuture <dbl>, haircut <dbl>, lecture <dbl>,
# alone <dbl>, height.in <dbl>, hand.span <dbl>, favcolor <chr>,
# lastsleep <dbl>, pulse <dbl>, year <dbl>, lovetrueage <dbl>
# keep the rows that satisfy both conditions: glasses is "y" and english is "n"
day1 %>%
  filter(glasses == "y" & english == "n")
# A tibble: 16 x 21
student sex glasses english statsofar ageguess smoke h.left
<dbl> <chr> <chr> <chr> <dbl> <dbl> <dbl> <dbl>
1 202001 <NA> y n NA NA 1 NA
2 202004 <NA> y n NA NA 1 NA
3 202009 <NA> y n NA NA 1 NA
4 202010 <NA> y n NA NA 1 NA
5 202029 <NA> y n NA NA 1 NA
6 202031 <NA> y n NA NA 1 NA
7 202035 <NA> y n NA NA 1 NA
8 202054 <NA> y n NA NA 1 NA
9 202058 <NA> y n NA NA 1 NA
10 202066 <NA> y n NA NA 1 NA
11 201918 <NA> y n 4 42 1 2
12 201919 <NA> y n 7 50 1 2
13 201931 <NA> y n 4 45 2 3
14 201932 <NA> y n 6 40 1 0
15 201951 <NA> y n 3 45 1 0
16 201954 <NA> y n 5 55 1 0
# ... with 13 more variables: h.right <dbl>, handedness <dbl>,
# statfuture <dbl>, haircut <dbl>, lecture <dbl>, alone <dbl>,
# height.in <dbl>, hand.span <dbl>, favcolor <chr>,
# lastsleep <dbl>, pulse <dbl>, year <dbl>, lovetrueage <dbl>
# count the rows with glasses "y" and favcolor "blue"
day1 %>%
  filter(glasses == "y" & favcolor == "blue") %>%
  count()
# A tibble: 1 x 1
n
<int>
1 27
# rows with glasses "y" and favcolor "blue", counted separately for each year
day1 %>%
  filter(glasses == "y" & favcolor == "blue") %>%
  count(year)
# A tibble: 2 x 2
year n
<dbl> <int>
1 2019 13
2 2020 14
# rows with english "y" and favcolor "blue", counted separately for each year
day1 %>%
  filter(english == "y" & favcolor == "blue") %>%
  count(year)
# A tibble: 7 x 2
year n
<dbl> <int>
1 2014 12
2 2015 16
3 2016 23
4 2017 14
5 2018 14
6 2019 18
7 2020 17
select is used to pick out columns (variables) that we want to use
# keep only these three columns, in the order listed; all 382 rows are retained
day1 %>% select(smoke, favcolor, lastsleep)
# A tibble: 382 x 3
smoke favcolor lastsleep
<dbl> <chr> <dbl>
1 1 blue 7
2 1 blue 6.5
3 1 purple 8
4 1 blue 7
5 1 purple 7
6 1 silver 8
7 1 green 6
8 1 blue 5
9 1 purple 7
10 1 green 6.5
# ... with 372 more rows
Three key verbs so far: filter, select, count
More to come.
I build plots using the ggplot2 package, which is part of the tidyverse. ggplot2 has a function called ggplot().
# histogram of lastsleep with default settings; no binwidth is given,
# so stat_bin() falls back to 30 bins and says so
ggplot(day1, aes(lastsleep)) +
  geom_histogram()
`stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
Warning: Removed 3 rows containing non-finite values (stat_bin).
# same histogram with bins one unit wide, blue bars outlined in yellow, and a title
ggplot(day1, aes(lastsleep)) +
  geom_histogram(
    binwidth = 1,
    fill = "royalblue",
    colour = "yellow"
  ) +
  labs(title = "Histogram of Sleep for 431 Students")
Warning: Removed 3 rows containing non-finite values (stat_bin).
# one boxplot of lastsleep for each value of english
ggplot(day1, aes(english, lastsleep)) +
  geom_boxplot()
Warning: Removed 3 rows containing non-finite values (stat_boxplot).
Get rid of the missing values…
# drop every row that is missing english or lastsleep, then draw the
# boxplots from the complete rows only
day1 %>%
  drop_na(english, lastsleep) %>%
  ggplot(aes(x = english, y = lastsleep)) +
  geom_boxplot()
# histogram of lastsleep drawn in a separate panel for each value of english
ggplot(day1, aes(lastsleep)) +
  geom_histogram(binwidth = 1) +
  facet_wrap(vars(english))
Warning: Removed 3 rows containing non-finite values (stat_bin).
# histogram of lastsleep drawn in a separate panel for each year
ggplot(day1, aes(lastsleep)) +
  geom_histogram(binwidth = 1) +
  facet_wrap(vars(year))
Warning: Removed 3 rows containing non-finite values (stat_bin).
# year is numeric, so convert it to a factor to get one boxplot per year
ggplot(day1, aes(factor(year), lastsleep)) +
  geom_boxplot()
Warning: Removed 3 rows containing non-finite values (stat_boxplot).
# scatterplot of lastsleep (y) against height.in (x)
ggplot(day1, aes(height.in, lastsleep)) +
  geom_point()
Warning: Removed 5 rows containing missing values (geom_point).
# jittered open-circle points (shape 1) with a straight-line (lm) fit on top
ggplot(day1, aes(height.in, lastsleep)) +
  geom_jitter(shape = 1) +
  geom_smooth(method = "lm") +
  labs(
    title = "Sleep and Height in 431 students",
    subtitle = "jittered points, with Linear fit"
  )
`geom_smooth()` using formula 'y ~ x'
Warning: Removed 5 rows containing non-finite values (stat_smooth).
Warning: Removed 5 rows containing missing values (geom_point).
# open-circle points with a loess smooth, and a custom label on the x axis
ggplot(day1, aes(height.in, lastsleep)) +
  geom_point(shape = 1) +
  geom_smooth(method = "loess") +
  xlab("New title for X axis")
`geom_smooth()` using formula 'y ~ x'
Warning: Removed 5 rows containing non-finite values (stat_smooth).
Warning: Removed 5 rows containing missing values (geom_point).