# Set the default chunk option echo = TRUE for the whole document, so each
# chunk's source code is displayed along with its output.
knitr::opts_chunk$set(echo = TRUE)
library(tidyverse)
## -- Attaching packages -------------------------------------------------------------------------------- tidyverse 1.2.1 --
## v ggplot2 2.2.1 v purrr 0.2.4
## v tibble 1.4.1 v dplyr 0.7.4
## v tidyr 0.7.2 v stringr 1.2.0
## v readr 1.1.1 v forcats 0.2.0
## -- Conflicts ----------------------------------------------------------------------------------- tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
library(dplyr); library(ggplot2); library(tibble); library(readr);
KimData will probably be a tibble, but it would be fine if it’s a data.frame. Soon, we’ll talk about the difference between the two, and why we generally prefer tibbles. You can tell which it is by just entering the dataset name and looking at the output. Tibble output is much nicer.
# Import the cleaned survey file with readr. The column types that readr
# guessed are reported in the parsing message that follows.
Clean_KimData <- read_csv(
  file = "C:/LocalFiles/Documents/Freshman TSU/STAT-220/Lab 2/Clean-KimData.csv"
)
## Parsed with column specification:
## cols(
## .default = col_double(),
## Semester = col_integer(),
## Gender = col_character(),
## `Birth Order` = col_character(),
## `dog vs. cat` = col_character(),
## Handed = col_character(),
## `On/Off Campus` = col_character(),
## `Calories per day` = col_integer(),
## `Politically Liberal` = col_integer(),
## `Religiously C or L` = col_integer(),
## `Socially C or L` = col_integer(),
## Phone = col_character()
## )
## See spec(...) for full column specifications.
# Work under the shorter name KimData from here on.
KimData <- Clean_KimData
# Save a copy of the imported data to the working directory.
write.csv(x = Clean_KimData, file = "CleanKimData")
# Open the spreadsheet-style viewer, then print per-column summaries.
View(KimData)
summary(object = KimData)
## Semester Gender Siblings Birth Order
## Min. : 0.000 Length:377 Min. : 0.000 Length:377
## 1st Qu.: 1.000 Class :character 1st Qu.: 1.000 Class :character
## Median : 2.000 Mode :character Median : 2.000 Mode :character
## Mean : 2.729 Mean : 1.831
## 3rd Qu.: 4.000 3rd Qu.: 2.000
## Max. :10.000 Max. :10.000
## NA's :4
## Shoe Size Height Weight dog vs. cat
## Min. : 5.000 Min. :45.0 Min. : 0.0 Length:377
## 1st Qu.: 7.500 1st Qu.:64.0 1st Qu.:127.0 Class :character
## Median : 9.000 Median :67.0 Median :146.0 Mode :character
## Mean : 9.498 Mean :66.8 Mean :152.2
## 3rd Qu.: 10.500 3rd Qu.:70.0 3rd Qu.:175.0
## Max. :113.000 Max. :77.0 Max. :300.0
## NA's :1 NA's :17 NA's :11
## Handed On/Off Campus Calories per day Servings of Fruit
## Length:377 Length:377 Min. : 500 Min. :0.000
## Class :character Class :character 1st Qu.:1750 1st Qu.:1.000
## Mode :character Mode :character Median :2000 Median :2.000
## Mean :2125 Mean :1.765
## 3rd Qu.:2500 3rd Qu.:2.000
## Max. :6000 Max. :7.000
## NA's :64 NA's :23
## Cups of Water Cups of Coffee Hours of Sleep
## Min. : 0.000 Min. :0.0000 Min. :4.000
## 1st Qu.: 3.625 1st Qu.:0.0000 1st Qu.:6.500
## Median : 5.000 Median :0.5000 Median :7.000
## Mean : 5.689 Mean :0.8112 Mean :7.053
## 3rd Qu.: 8.000 3rd Qu.:1.0000 3rd Qu.:8.000
## Max. :32.000 Max. :5.0000 Max. :9.000
## NA's :7 NA's :123
## Hours spent studying per week Hours spent working per week
## Min. : 1.00 Min. : 0.000
## 1st Qu.: 7.00 1st Qu.: 2.625
## Median :10.00 Median : 5.000
## Mean :13.14 Mean : 7.401
## 3rd Qu.:18.00 3rd Qu.:10.000
## Max. :50.00 Max. :40.000
## NA's :10 NA's :99
## Hours spent workingout per wee Hours socializing per w
## Min. : 0.000 Min. : 0.00
## 1st Qu.: 2.000 1st Qu.: 6.00
## Median : 4.000 Median : 10.00
## Mean : 4.496 Mean : 13.77
## 3rd Qu.: 6.000 3rd Qu.: 20.00
## Max. :22.000 Max. :100.00
## NA's :10 NA's :23
## Politically Liberal Religiously C or L Socially C or L
## Min. :-2.0000 Min. :-2.0000 Min. :-2.0000
## 1st Qu.:-1.0000 1st Qu.:-1.0000 1st Qu.: 0.0000
## Median : 0.0000 Median : 0.0000 Median : 1.0000
## Mean : 0.1622 Mean : 0.1054 Mean : 0.4595
## 3rd Qu.: 1.0000 3rd Qu.: 1.0000 3rd Qu.: 2.0000
## Max. : 2.0000 Max. : 2.0000 Max. : 2.0000
## NA's :7 NA's :7 NA's :7
## Phone Hrs per day on phone Hrs/day on phone not talking
## Length:377 Min. : 0.000 Min. : 0.000
## Class :character 1st Qu.: 1.000 1st Qu.: 1.000
## Mode :character Median : 2.000 Median : 2.000
## Mean : 2.579 Mean : 2.363
## 3rd Qu.: 3.000 3rd Qu.: 3.000
## Max. :35.000 Max. :18.000
## NA's :212 NA's :213
Filter selects certain observations. So, if we just want those who identified themselves as women in the dataset:
# Keep only the rows whose Gender is "F". The dplyr:: prefix makes explicit
# that this is dplyr's filter(), which masks stats::filter().
dplyr::filter(KimData, Gender == "F")
## # A tibble: 214 x 25
## Semester Gender Siblings `Birth Order` `Shoe Size` Height Weight
## <int> <chr> <dbl> <chr> <dbl> <dbl> <dbl>
## 1 6 F 5.00 Middle 11.0 71.0 195
## 2 4 F 0 Only 10.0 64.0 187
## 3 6 F 1.00 Last 9.50 69.0 150
## 4 7 F 3.00 First 9.50 64.0 193
## 5 2 F 3.00 Middle 11.0 69.5 180
## 6 4 F 0 Only 7.00 64.0 135
## 7 4 F 1.00 Last 7.50 65.0 130
## 8 4 F 1.00 Last 6.50 67.0 128
## 9 2 F 2.00 First 8.00 65.0 124
## 10 2 F 3.00 Middle 8.00 65.0 145
## # ... with 204 more rows, and 18 more variables: `dog vs. cat` <chr>,
## # Handed <chr>, `On/Off Campus` <chr>, `Calories per day` <int>,
## # `Servings of Fruit` <dbl>, `Cups of Water` <dbl>, `Cups of Coffee`
## # <dbl>, `Hours of Sleep` <dbl>, `Hours spent studying per week` <dbl>,
## # `Hours spent working per week` <dbl>, `Hours spent workingout per wee`
## # <dbl>, `Hours socializing per w` <dbl>, `Politically Liberal` <int>,
## # `Religiously C or L` <int>, `Socially C or L` <int>, Phone <chr>, `Hrs
## # per day on phone` <dbl>, `Hrs/day on phone not talking` <dbl>
# Keep only the rows reporting zero siblings.
dplyr::filter(KimData, Siblings == 0)
## # A tibble: 39 x 25
## Semester Gender Siblings `Birth Order` `Shoe Size` Height Weight
## <int> <chr> <dbl> <chr> <dbl> <dbl> <dbl>
## 1 4 F 0 Only 10.0 64.0 187
## 2 4 F 0 Only 7.00 64.0 135
## 3 10 M 0 Only 13.0 72.0 210
## 4 7 M 0 Only 11.0 73.0 163
## 5 2 M 0 Only 10.0 65.0 130
## 6 4 F 0 Only 7.00 62.0 135
## 7 4 F 0 Only 7.00 64.0 105
## 8 1 M 0 Only 8.00 65.0 135
## 9 3 M 0 Only 9.50 62.0 120
## 10 4 F 0 Only 7.00 63.0 110
## # ... with 29 more rows, and 18 more variables: `dog vs. cat` <chr>,
## # Handed <chr>, `On/Off Campus` <chr>, `Calories per day` <int>,
## # `Servings of Fruit` <dbl>, `Cups of Water` <dbl>, `Cups of Coffee`
## # <dbl>, `Hours of Sleep` <dbl>, `Hours spent studying per week` <dbl>,
## # `Hours spent working per week` <dbl>, `Hours spent workingout per wee`
## # <dbl>, `Hours socializing per w` <dbl>, `Politically Liberal` <int>,
## # `Religiously C or L` <int>, `Socially C or L` <int>, Phone <chr>, `Hrs
## # per day on phone` <dbl>, `Hrs/day on phone not talking` <dbl>
Notice that this has n=162 observations, not 377.
# Store the rows with Gender "M" as their own dataset and open it in the
# data viewer.
StinkyBoys <- dplyr::filter(.data = KimData, Gender == "M")
View(StinkyBoys)
We can also use fancier mathematical “relational operators.” Notice that this has n=103 observations, all with semester 4 or higher. `?"=="` gives you a large list of relational operators. “!=” means “not equal to.”
# Relational operators work inside filter(): keep rows with Semester above 3.
UpperClass <- dplyr::filter(.data = KimData, Semester > 3)
View(UpperClass)
You can use the | character to mean “or.” Notice that you have to write Semester twice – you can’t do “Semester == 1 | 0”. You can use the & character to mean “and” (although not to match several values of a single variable – that’s a job for OR). `?"&"` gives you a large list of logical operators.
# `|` (or): rows whose Semester is 0 or 1.
Freshlings <- dplyr::filter(KimData, Semester == 0 | Semester == 1)
# `&` (and): rows that are both Gender "M" and Semester 1, i.e. freshman
# males. This replaces the Freshlings object created on the previous line.
Freshlings <- dplyr::filter(KimData, Gender == "M" & Semester == 1)
View(Freshlings)
Arrange sorts your data into a certain order. For example, we can put the data in order from newest at Truman to senior-est. Missing values (NAs) go at the end.
# Sort the rows by Semester in ascending order.
FreshFirst <- dplyr::arrange(.data = KimData, Semester)
Select selects by variable (kind of the transpose of filter, which works on rows, not columns). This is especially helpful for super-giant datasets, so you can make something smaller to work with. If we just want the first few physical variables, we can list them. Those weird quotes (upper left of the keyboard, same as RMarkdown) are needed for spaces, etc.
# Keep just the physical-description columns. Backticks are required around
# a column name that contains a space.
KimDataPhysical <- select(
  .data = KimData,
  Semester,
  Gender,
  `Shoe Size`,
  Height,
  Weight,
  Handed
)
View(KimDataPhysical)
If we just want the numerical variables, we can go through and note their column numbers. Rename is a variation of select that simply changes the name of a variable. You can see how the name changes in the environment window. It would be annoying, but possible, to rename variables with the select command: KimDataPhysical <- select(KimDataPhysical, Shoe.Size = `Shoe Size`, everything()). This works, but moves Shoe.Size to the first column.
# Select every numeric (integer or double) column by position:
#   1      Semester
#   3      Siblings
#   5:7    Shoe Size, Height, Weight
#   11:22  Calories per day through Socially C or L
#   24:25  Hrs per day on phone, Hrs/day on phone not talking
# The character columns (2, 4, 8:10 and 23) are left out. Positions follow
# the column order shown by summary(KimData).
KimNumData <- select(KimData, c(1, 3, 5:7, 11:22, 24:25))
View(KimNumData)
# rename() takes new_name = old_name and leaves the column order untouched.
KimDataP2 <- rename(.data = KimDataPhysical, Shoe.Size = `Shoe Size`)
Mutate creates a new variable that is a function of existing ones. This is especially helpful for data cleaning, changing units, changing data types, or whatever.
# Add the height in centimetres, rounded to a whole number. Height appears to
# be recorded in inches (values run from about 45 to 77), and 1 in = 2.54 cm,
# so the conversion multiplies by 2.54.
KimDataP3 <- mutate(KimDataPhysical, Heightcm = round(Height * 2.54, 0))
# Replace the character Gender column with a factor version of itself.
KimDataP4 <- mutate(KimDataPhysical, Gender = as.factor(Gender))
If you look at Shoe.Size, you can see one outlier.
# Per-column summaries of the physical variables; check the maximum of
# `Shoe Size`.
summary(object = KimDataPhysical)
## Semester Gender Shoe Size Height
## Min. : 0.000 Length:377 Min. : 5.000 Min. :45.0
## 1st Qu.: 1.000 Class :character 1st Qu.: 7.500 1st Qu.:64.0
## Median : 2.000 Mode :character Median : 9.000 Median :67.0
## Mean : 2.729 Mean : 9.498 Mean :66.8
## 3rd Qu.: 4.000 3rd Qu.: 10.500 3rd Qu.:70.0
## Max. :10.000 Max. :113.000 Max. :77.0
## NA's :1 NA's :17
## Weight Handed
## Min. : 0.0 Length:377
## 1st Qu.:127.0 Class :character
## Median :146.0 Mode :character
## Mean :152.2
## 3rd Qu.:175.0
## Max. :300.0
## NA's :11
One guy has a size 113 shoe. It should be size 13. sub is the command in base R used for substituting one value for another. sub converts a variable to a character string, so we have to turn it back into a number.
# Correct the mistyped shoe size. sub() works on text, so its pattern and
# replacement are written as strings here (numbers would be coerced to the
# same strings) and the result is converted back to numeric.
KimDataP5 <- mutate(
  KimDataP2,
  Shoe.Size = as.numeric(sub("113", "13", Shoe.Size))
)
summary(KimDataP5$Shoe.Size)
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 5.000 7.500 9.000 9.232 10.500 16.000 1
That’s better. 16 is still big, but realistic. Mutate can get tricky pretty quickly. Maybe we want to convert Gender into a factor, and turn the missing one into an NA.
# Blank out the "other" responses with NA, then store Gender as a factor.
KimDataP4 <- mutate(
  KimDataPhysical,
  Gender = as.factor(sub("other", NA, Gender))
)
The book has a list of helpful commands to include in mutate functions http://r4ds.had.co.nz/transform.html#mutate-funs
We made new datasets for each of these, but often you would just stack the changes at once. The tidyr package (coming soon) allows multiple changes and other tricks for data cleaning.
By itself, summarize is a pokey way to do simple computations.
# Overall mean weight; na.rm = TRUE drops the missing weights first.
summarise(.data = KimData, MeanWeight = mean(Weight, na.rm = TRUE))
## # A tibble: 1 x 1
## MeanWeight
## <dbl>
## 1 152
# Same calculation without na.rm: because Weight contains missing values,
# the mean comes back as NA.
summarise(.data = KimData, MeanWeight = mean(Weight))
## # A tibble: 1 x 1
## MeanWeight
## <dbl>
## 1 NA
With group_by, it becomes a magical machine
# Attach a Gender grouping to the data, then open it in the data viewer.
by_gender <- group_by(.data = KimData, Gender)
View(by_gender)
It doesn’t actually do anything to the data itself.
# Print the grouped tibble; the header reports the grouping variable.
by_gender
## # A tibble: 377 x 25
## # Groups: Gender [3]
## Semester Gender Siblings `Birth Order` `Shoe Size` Height Weight
## <int> <chr> <dbl> <chr> <dbl> <dbl> <dbl>
## 1 6 F 5.00 Middle 11.0 71.0 195
## 2 4 F 0 Only 10.0 64.0 187
## 3 6 F 1.00 Last 9.50 69.0 150
## 4 7 F 3.00 First 9.50 64.0 193
## 5 6 M 2.00 Middle 13.0 73.0 181
## 6 6 M 2.00 First 10.0 68.0 167
## 7 6 M 3.00 First 13.0 73.0 190
## 8 9 M 1.00 Last 12.0 74.0 195
## 9 2 F 3.00 Middle 11.0 69.5 180
## 10 4 M 3.00 Last 11.0 72.5 175
## # ... with 367 more rows, and 18 more variables: `dog vs. cat` <chr>,
## # Handed <chr>, `On/Off Campus` <chr>, `Calories per day` <int>,
## # `Servings of Fruit` <dbl>, `Cups of Water` <dbl>, `Cups of Coffee`
## # <dbl>, `Hours of Sleep` <dbl>, `Hours spent studying per week` <dbl>,
## # `Hours spent working per week` <dbl>, `Hours spent workingout per wee`
## # <dbl>, `Hours socializing per w` <dbl>, `Politically Liberal` <int>,
## # `Religiously C or L` <int>, `Socially C or L` <int>, Phone <chr>, `Hrs
## # per day on phone` <dbl>, `Hrs/day on phone not talking` <dbl>
If it’s a tibble, it starts by telling you about the groups.
# One row per Gender group: the mean weight, ignoring missing values.
summarise(.data = by_gender, MeanWeight = mean(Weight, na.rm = TRUE))
## # A tibble: 3 x 2
## Gender MeanWeight
## <chr> <dbl>
## 1 F 136
## 2 M 173
## 3 other 115
Snazzy! Do it again, this time by semester.
# Group by Semester, then compute the mean weight within each semester,
# ignoring missing values.
by_sem <- group_by(.data = KimData, Semester)
summarise(.data = by_sem, MeanWeight = mean(Weight, na.rm = TRUE))
## # A tibble: 11 x 2
## Semester MeanWeight
## <int> <dbl>
## 1 0 155
## 2 1 149
## 3 2 149
## 4 3 161
## 5 4 146
## 6 5 154
## 7 6 156
## 8 7 171
## 9 8 144
## 10 9 195
## 11 10 210
Now, together.
# Group by both variables, so each Gender-by-Semester combination is a group.
by_Gsem <- group_by(.data = KimData, Gender, Semester)
# Mean weight within each combination, ignoring missing values.
Gsem_means <- summarise(
  .data = by_Gsem,
  MeanWeight = mean(Weight, na.rm = TRUE)
)
It looks at all of the subgroups. It’s hard to see with that many.
# Open the table of subgroup means in the data viewer.
View(Gsem_means)
A pipe is a way to connect multiple lines of code. It basically means, “take the result of this line down to the next line.” ggplot uses + as a pipe. That’s cool, but un-tidy (because + also means “add these up”). dplyr and most other tidyverse packages use a unique pipe that has no other meaning: %>%. No, really, that’s what it looks like. Yes, that’s weird. But you have to admit that you aren’t going to use %>% for anything else.
# Mean height for every Gender-by-Semester cell, keeping only cells with more
# than two people and ordering the rows from shortest to tallest.
tall_Kim <- KimData %>%
  group_by(Gender, Semester) %>%
  summarise(
    count = n(),                       # number of people in the cell
    tall = mean(Height, na.rm = TRUE)  # mean height, missing values ignored
  ) %>%
  filter(count > 2) %>%                # drop the low-n cells
  arrange(tall)                        # ascending by mean height
# Print the finished table.
tall_Kim
## # A tibble: 16 x 4
## # Groups: Gender [2]
## Gender Semester count tall
## <chr> <int> <int> <dbl>
## 1 F 1 60 62.0
## 2 F 5 5 62.9
## 3 F 4 30 64.3
## 4 F 7 5 65.0
## 5 F 2 66 65.2
## 6 F 6 11 65.8
## 7 F 3 28 66.5
## 8 F 0 7 66.5
## 9 M 3 25 68.6
## 10 M 2 44 69.5
## 11 M 8 4 69.8
## 12 M 1 41 69.8
## 13 M 4 17 70.5
## 14 M 7 11 71.0
## 15 M 6 9 71.2
## 16 M 5 7 71.4
This script groups individuals by Gender and Semester, counts the number in each cell, then computes the average height. It eliminates low-n cells, then sorts it smallest to largest. That could be handy, right?
Notice that this script is long and wordy, but easy to understand. When you mix dplyr and ggplot, you have to be careful to get the pipes correct. That can be annoying, but you should keep your graphs away from your data management anyway. How about this? A chart of the average height by gender and semester (excluding small groups).
# Scatterplot of mean height against semester, one colour per gender. The
# aesthetics are declared on the point layer itself.
ggplot(data = tall_Kim) +
  geom_point(mapping = aes(x = Semester, y = tall, color = Gender))
Make a new version of the KimData, named “Righties”
# Start the exercise from a copy of the full dataset under a new name.
Righties <- KimData
Select the KimPhysical vars discussed above, plus the 3 conservative/liberal ones.
# Keep the physical variables plus the three conservative/liberal scores.
Righties <- select(
  .data = KimData,
  Semester,
  Gender,
  `Shoe Size`,
  Height,
  Weight,
  Handed,
  `Politically Liberal`,
  `Religiously C or L`,
  `Socially C or L`
)
View(Righties)
Create a new variable, BMI, that calculates Body Mass Index from Height and Weight. Google to find that formula. [weight (lb) / height (in) / height (in)] x 703= BMI
# Add a BMI column: 703 * weight (lb) / height (in)^2, rounded to 2 decimals.
# The whole formula sits inside round(), so the trailing 2 is the number of
# digits rather than an extra argument to mutate().
BMI <- mutate(Righties, BMI = round(703 * Weight / (Height * Height), 2))
Limit your group to only Right-handed people.
# Keep only the rows whose Handed value is "Right".
Righties <- dplyr::filter(.data = Righties, Handed == "Right")
Group your data by Semester
# Group the rows by Semester. group_by() attaches the grouping that later
# summaries use, whereas arrange() would only sort the rows.
Righties <- group_by(Righties, Semester)
Calculate the average BMI by Semester.
# Mean BMI per semester among right-handed people, keeping only semesters
# with more than two people and ordering from lowest to highest mean BMI.
handed_Righties <- BMI %>%
  filter(Handed == "Right") %>%
  group_by(Semester) %>%
  summarise(
    count = n(),                         # people in the semester
    bodyType = mean(BMI, na.rm = TRUE)   # mean BMI, missing values ignored
  ) %>%
  filter(count > 2) %>%                  # drop the low-n semesters
  arrange(bodyType)                      # ascending by mean BMI
# Print the finished table.
handed_Righties
## # A tibble: 9 x 3
## Semester count bodyType
## <int> <int> <dbl>
## 1 8 6 22.0
## 2 2 95 22.9
## 3 4 44 22.9
## 4 5 9 23.2
## 5 0 9 23.3
## 6 6 20 23.5
## 7 3 47 25.0
## 8 1 91 25.0
## 9 7 13 26.4
In knitr, what’s the difference between include=FALSE and echo=FALSE? Why could that be important?
echo = FALSE hides the code, but will evaluate it and show its output in the knit. include = FALSE hides the code AND the output from the knit, but will evaluate the code silently.
This is important because sometimes you want to show the output and sometimes you don’t.
Plot #1: Make a scatterplot for all individuals in the full KimData set, x=height, y=BMI
# Add an unrounded BMI column to the full dataset, then plot BMI against
# height.
KimDataBMI <- mutate(.data = KimData, BMI = 703 * Weight / Height^2)
ggplot(data = KimDataBMI, mapping = aes(x = Height, y = BMI)) +
  geom_point()
## Warning: Removed 25 rows containing missing values (geom_point).
Plot #2: Color your scatterplot by Handed.
# Same scatterplot, with the points coloured by handedness.
ggplot(
  data = KimDataBMI,
  mapping = aes(x = Height, y = BMI, color = Handed)
) +
  geom_point()
## Warning: Removed 25 rows containing missing values (geom_point).
Plot #3: Add smooth trendlines to make it snazzy
# Jittered points plus a smooth trend line per handedness group. The shared
# aesthetics are declared once in ggplot() and inherited by both layers.
ggplot(
  data = KimDataBMI,
  mapping = aes(x = Height, y = BMI, color = Handed)
) +
  geom_jitter() +
  geom_smooth()
## `geom_smooth()` using method = 'loess'
## Warning: Removed 25 rows containing non-finite values (stat_smooth).
## Warning: Removed 25 rows containing missing values (geom_point).
From this plot, I can tell who is right and left handed and how their heights and BMIs relate. From the curve, I see that left-handed people have a higher BMI when shorter and right handed people have a lower BMI when taller. Also, there is only one ambidextrous person.