knitr::opts_chunk$set(echo = TRUE)
library(tidyverse)
## -- Attaching packages -------------------------------------------------------------------------------- tidyverse 1.2.1 --
## v ggplot2 2.2.1     v purrr   0.2.4
## v tibble  1.4.1     v dplyr   0.7.4
## v tidyr   0.7.2     v stringr 1.2.0
## v readr   1.1.1     v forcats 0.2.0
## -- Conflicts ----------------------------------------------------------------------------------- tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
library(dplyr); library(ggplot2); library(tibble); library(readr);

Kim Data Set

KimData will probably be a tibble, but it would be fine if it’s a data.frame. Soon, we’ll talk about the difference between the two, and why we generally prefer tibbles. You can tell which it is by just entering the dataset name and looking at the output. Tibble output is much nicer.

# Import the cleaned survey file; readr prints the column types it guessed.
# NOTE(review): absolute, machine-specific path - only runs where this folder exists.
Clean_KimData <- read_csv(
  file = "C:/LocalFiles/Documents/Freshman TSU/STAT-220/Lab 2/Clean-KimData.csv"
)
## Parsed with column specification:
## cols(
##   .default = col_double(),
##   Semester = col_integer(),
##   Gender = col_character(),
##   `Birth Order` = col_character(),
##   `dog vs. cat` = col_character(),
##   Handed = col_character(),
##   `On/Off Campus` = col_character(),
##   `Calories per day` = col_integer(),
##   `Politically Liberal` = col_integer(),
##   `Religiously C or L` = col_integer(),
##   `Socially C or L` = col_integer(),
##   Phone = col_character()
## )
## See spec(...) for full column specifications.
# Save a copy of the imported data in the working directory.
write.csv(Clean_KimData, file = "CleanKimData")

# Use the shorter name for the rest of the lab.
KimData <- Clean_KimData

# Inspect the data: spreadsheet viewer first, then a per-column summary.
View(KimData)
summary(KimData)
##     Semester         Gender             Siblings      Birth Order       
##  Min.   : 0.000   Length:377         Min.   : 0.000   Length:377        
##  1st Qu.: 1.000   Class :character   1st Qu.: 1.000   Class :character  
##  Median : 2.000   Mode  :character   Median : 2.000   Mode  :character  
##  Mean   : 2.729                      Mean   : 1.831                     
##  3rd Qu.: 4.000                      3rd Qu.: 2.000                     
##  Max.   :10.000                      Max.   :10.000                     
##                                      NA's   :4                          
##    Shoe Size           Height         Weight      dog vs. cat       
##  Min.   :  5.000   Min.   :45.0   Min.   :  0.0   Length:377        
##  1st Qu.:  7.500   1st Qu.:64.0   1st Qu.:127.0   Class :character  
##  Median :  9.000   Median :67.0   Median :146.0   Mode  :character  
##  Mean   :  9.498   Mean   :66.8   Mean   :152.2                     
##  3rd Qu.: 10.500   3rd Qu.:70.0   3rd Qu.:175.0                     
##  Max.   :113.000   Max.   :77.0   Max.   :300.0                     
##  NA's   :1         NA's   :17     NA's   :11                        
##     Handed          On/Off Campus      Calories per day Servings of Fruit
##  Length:377         Length:377         Min.   : 500     Min.   :0.000    
##  Class :character   Class :character   1st Qu.:1750     1st Qu.:1.000    
##  Mode  :character   Mode  :character   Median :2000     Median :2.000    
##                                        Mean   :2125     Mean   :1.765    
##                                        3rd Qu.:2500     3rd Qu.:2.000    
##                                        Max.   :6000     Max.   :7.000    
##                                        NA's   :64       NA's   :23       
##  Cups of Water    Cups of Coffee   Hours of Sleep 
##  Min.   : 0.000   Min.   :0.0000   Min.   :4.000  
##  1st Qu.: 3.625   1st Qu.:0.0000   1st Qu.:6.500  
##  Median : 5.000   Median :0.5000   Median :7.000  
##  Mean   : 5.689   Mean   :0.8112   Mean   :7.053  
##  3rd Qu.: 8.000   3rd Qu.:1.0000   3rd Qu.:8.000  
##  Max.   :32.000   Max.   :5.0000   Max.   :9.000  
##  NA's   :7        NA's   :123                     
##  Hours spent studying per week Hours spent working per week
##  Min.   : 1.00                 Min.   : 0.000              
##  1st Qu.: 7.00                 1st Qu.: 2.625              
##  Median :10.00                 Median : 5.000              
##  Mean   :13.14                 Mean   : 7.401              
##  3rd Qu.:18.00                 3rd Qu.:10.000              
##  Max.   :50.00                 Max.   :40.000              
##  NA's   :10                    NA's   :99                  
##  Hours spent workingout per wee Hours socializing per w
##  Min.   : 0.000                 Min.   :  0.00         
##  1st Qu.: 2.000                 1st Qu.:  6.00         
##  Median : 4.000                 Median : 10.00         
##  Mean   : 4.496                 Mean   : 13.77         
##  3rd Qu.: 6.000                 3rd Qu.: 20.00         
##  Max.   :22.000                 Max.   :100.00         
##  NA's   :10                     NA's   :23             
##  Politically Liberal Religiously C or L Socially C or L  
##  Min.   :-2.0000     Min.   :-2.0000    Min.   :-2.0000  
##  1st Qu.:-1.0000     1st Qu.:-1.0000    1st Qu.: 0.0000  
##  Median : 0.0000     Median : 0.0000    Median : 1.0000  
##  Mean   : 0.1622     Mean   : 0.1054    Mean   : 0.4595  
##  3rd Qu.: 1.0000     3rd Qu.: 1.0000    3rd Qu.: 2.0000  
##  Max.   : 2.0000     Max.   : 2.0000    Max.   : 2.0000  
##  NA's   :7           NA's   :7          NA's   :7        
##     Phone           Hrs per day on phone Hrs/day on phone not talking
##  Length:377         Min.   : 0.000       Min.   : 0.000              
##  Class :character   1st Qu.: 1.000       1st Qu.: 1.000              
##  Mode  :character   Median : 2.000       Median : 2.000              
##                     Mean   : 2.579       Mean   : 2.363              
##                     3rd Qu.: 3.000       3rd Qu.: 3.000              
##                     Max.   :35.000       Max.   :18.000              
##                     NA's   :212          NA's   :213

1. Filter

Filter selects certain observations. So, if we just want those who identified themselves as women in the dataset:

# Keep only the rows where Gender is "F".
filter(.data = KimData, Gender == "F")
## # A tibble: 214 x 25
##    Semester Gender Siblings `Birth Order` `Shoe Size` Height Weight
##       <int> <chr>     <dbl> <chr>               <dbl>  <dbl>  <dbl>
##  1        6 F          5.00 Middle              11.0    71.0    195
##  2        4 F          0    Only                10.0    64.0    187
##  3        6 F          1.00 Last                 9.50   69.0    150
##  4        7 F          3.00 First                9.50   64.0    193
##  5        2 F          3.00 Middle              11.0    69.5    180
##  6        4 F          0    Only                 7.00   64.0    135
##  7        4 F          1.00 Last                 7.50   65.0    130
##  8        4 F          1.00 Last                 6.50   67.0    128
##  9        2 F          2.00 First                8.00   65.0    124
## 10        2 F          3.00 Middle               8.00   65.0    145
## # ... with 204 more rows, and 18 more variables: `dog vs. cat` <chr>,
## #   Handed <chr>, `On/Off Campus` <chr>, `Calories per day` <int>,
## #   `Servings of Fruit` <dbl>, `Cups of Water` <dbl>, `Cups of Coffee`
## #   <dbl>, `Hours of Sleep` <dbl>, `Hours spent studying per week` <dbl>,
## #   `Hours spent working per week` <dbl>, `Hours spent workingout per wee`
## #   <dbl>, `Hours socializing per w` <dbl>, `Politically Liberal` <int>,
## #   `Religiously C or L` <int>, `Socially C or L` <int>, Phone <chr>, `Hrs
## #   per day on phone` <dbl>, `Hrs/day on phone not talking` <dbl>
# Keep only the rows for students who reported zero siblings.
filter(.data = KimData, Siblings == 0)
## # A tibble: 39 x 25
##    Semester Gender Siblings `Birth Order` `Shoe Size` Height Weight
##       <int> <chr>     <dbl> <chr>               <dbl>  <dbl>  <dbl>
##  1        4 F             0 Only                10.0    64.0    187
##  2        4 F             0 Only                 7.00   64.0    135
##  3       10 M             0 Only                13.0    72.0    210
##  4        7 M             0 Only                11.0    73.0    163
##  5        2 M             0 Only                10.0    65.0    130
##  6        4 F             0 Only                 7.00   62.0    135
##  7        4 F             0 Only                 7.00   64.0    105
##  8        1 M             0 Only                 8.00   65.0    135
##  9        3 M             0 Only                 9.50   62.0    120
## 10        4 F             0 Only                 7.00   63.0    110
## # ... with 29 more rows, and 18 more variables: `dog vs. cat` <chr>,
## #   Handed <chr>, `On/Off Campus` <chr>, `Calories per day` <int>,
## #   `Servings of Fruit` <dbl>, `Cups of Water` <dbl>, `Cups of Coffee`
## #   <dbl>, `Hours of Sleep` <dbl>, `Hours spent studying per week` <dbl>,
## #   `Hours spent working per week` <dbl>, `Hours spent workingout per wee`
## #   <dbl>, `Hours socializing per w` <dbl>, `Politically Liberal` <int>,
## #   `Religiously C or L` <int>, `Socially C or L` <int>, Phone <chr>, `Hrs
## #   per day on phone` <dbl>, `Hrs/day on phone not talking` <dbl>

Stinky Boys

Notice that this has n = 162 observations, not the full 377.

# Subset of the data containing only the rows where Gender is "M".
StinkyBoys <- filter(.data = KimData, Gender == "M")
View(StinkyBoys)

Upper Class

We can also use fancier mathematical “relational operators.” Notice that this has n=103 observations, all with semester 4 or higher. ?== gives you a large list of relational operators. “!=” means “not equal to”

# Students in semester 4 or later (Semester is a whole number, so this is
# the same set as "greater than 3").
UpperClass <- filter(.data = KimData, Semester >= 4)
View(UpperClass)

Freshlings

You can use the | character to mean “or”. Notice that you have to write Semester twice – you can’t do “Semester == 1 | 0”. You can use the & character to mean “and” (although not to match two values of a single variable – that’s “or”). ?"&" gives you a list of logical operators.

# "Or" example: anyone in semester 0 or semester 1.
Freshlings <- filter(.data = KimData, Semester == 0 | Semester == 1)

# "And" example: first-semester males only. This reuses the same name, so it
# replaces the result of the "or" example.
Freshlings <- filter(.data = KimData, Gender == "M" & Semester == 1)
View(Freshlings)

2. Arrange

Arrange sorts your data into a certain order. So, if we want the data in order from newest at Truman to senior-est, we sort by Semester. Missing values (NAs) go at the end.

# Sort ascending by Semester, so the newest students come first.
FreshFirst <- arrange(.data = KimData, Semester)

3. Select

Select selects by variable (kind of the transpose of filter, which works on rows, not columns). This is especially helpful for super-giant datasets, so you can make something smaller to work with. If we just want the first few physical variables, we can list them. Backticks (the ` character, upper left of the keyboard, the same one RMarkdown uses) are needed around names that contain spaces or other special characters.

# Keep only Semester plus the physical-trait columns.
KimDataPhysical <- select(
  .data = KimData,
  Semester,
  Gender,
  `Shoe Size`,
  Height,
  Weight,
  Handed
)
View(KimDataPhysical)

If we just want the numerical variables, we can go through and note their column numbers. Rename is a variation of select that simply changes the name of a variable. You can see how the name changes in the environment window. It would be annoying, but possible, to rename variables with the select command: KimDataPhysical <- select(KimDataPhysical, Shoe.Size = `Shoe Size`, everything()). This works, but moves Shoe.Size to the first column.

# Keep every numeric column, selected by position. Columns 2, 4, 8, 9, 10 and
# 23 (Gender, Birth Order, dog vs. cat, Handed, On/Off Campus, Phone) are
# character and are skipped. The earlier index list also left out three
# numeric columns: 11 (Calories per day), 24 and 25 (the two phone-hours
# columns).
KimNumData <- select(KimData, c(1, 3, 5:7, 11:22, 24:25))
View(KimNumData)

# rename() changes a column name without moving the column.
KimDataP2 <- rename(KimDataPhysical, Shoe.Size = `Shoe Size`)

4. Mutate

Mutate creates a new variable that is a function of existing ones. This is especially helpful for data cleaning, or changing units, data types or whatever.

# Unit change: Height is recorded in inches, and 1 in = 2.54 cm, so multiply
# (the earlier version divided, which gave heights of about 26 "cm").
# Rounded to the nearest whole centimetre.
KimDataP3 <- mutate(KimDataPhysical, Heightcm = round(Height * 2.54, 0))

# Type change: store Gender as a factor instead of a character vector.
KimDataP4 <- mutate(KimDataPhysical, Gender = as.factor(Gender))

If you look at Shoe Size, you can see one outlier.

# Per-column summary of the physical-traits subset.
summary(object = KimDataPhysical)
##     Semester         Gender            Shoe Size           Height    
##  Min.   : 0.000   Length:377         Min.   :  5.000   Min.   :45.0  
##  1st Qu.: 1.000   Class :character   1st Qu.:  7.500   1st Qu.:64.0  
##  Median : 2.000   Mode  :character   Median :  9.000   Median :67.0  
##  Mean   : 2.729                      Mean   :  9.498   Mean   :66.8  
##  3rd Qu.: 4.000                      3rd Qu.: 10.500   3rd Qu.:70.0  
##  Max.   :10.000                      Max.   :113.000   Max.   :77.0  
##                                      NA's   :1         NA's   :17    
##      Weight         Handed         
##  Min.   :  0.0   Length:377        
##  1st Qu.:127.0   Class :character  
##  Median :146.0   Mode  :character  
##  Mean   :152.2                     
##  3rd Qu.:175.0                     
##  Max.   :300.0                     
##  NA's   :11

One guy has a size 113 shoe. It should be size 13. sub is the command in base R used for substituting one value for another. sub converts a variable to a character string, so we have to turn it back into a number.

# Fix the data-entry error: a shoe size recorded as 113 should be 13.
# sub() works on text, so the column is converted to character and then back
# to numeric. The pattern is anchored (^...$) so only a value that is exactly
# "113" is rewritten, never a longer value that merely contains those digits.
KimDataP5 <- mutate(
  KimDataP2,
  Shoe.Size = as.numeric(sub("^113$", "13", Shoe.Size))
)
summary(KimDataP5$Shoe.Size)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##   5.000   7.500   9.000   9.232  10.500  16.000       1

That’s better. 16 is still big, but realistic. Mutate can get tricky pretty quickly. Maybe we want to convert Gender into a factor, and turn the missing one into an NA.

# Recode "other" as missing, then store Gender as a factor.
# na_if() replaces values that equal "other" exactly, so no regular-expression
# matching is involved and other values cannot be blanked by a partial match.
KimDataP4 <- mutate(
  KimDataPhysical,
  Gender = as.factor(na_if(Gender, "other"))
)

The book has a list of helpful commands to include in mutate functions http://r4ds.had.co.nz/transform.html#mutate-funs

We made new datasets for each of these, but often you would just stack the changes at once. The tidyr package (coming soon) allows multiple changes and other tricks for data cleaning.

5. Summarize (or summarise, if you’re from New Zealand), often with group_by

By itself, summarize is a pokey way to do simple computations.

# Overall mean weight, ignoring missing values.
summarise(
  .data = KimData,
  MeanWeight = mean(Weight, na.rm = TRUE)
)
## # A tibble: 1 x 1
##   MeanWeight
##        <dbl>
## 1        152
# Same computation without na.rm: the missing weights make the result NA.
summarise(
  .data = KimData,
  MeanWeight = mean(Weight)
)
## # A tibble: 1 x 1
##   MeanWeight
##        <dbl>
## 1         NA

With group_by, it becomes a magical machine

# Mark the rows as grouped by Gender; later summaries run once per group.
by_gender <- group_by(.data = KimData, Gender)
View(by_gender)

It doesn’t actually do anything to the data itself.

# Print the grouped tibble: same 377 rows, with a "Groups:" line in the header.
by_gender       
## # A tibble: 377 x 25
## # Groups: Gender [3]
##    Semester Gender Siblings `Birth Order` `Shoe Size` Height Weight
##       <int> <chr>     <dbl> <chr>               <dbl>  <dbl>  <dbl>
##  1        6 F          5.00 Middle              11.0    71.0    195
##  2        4 F          0    Only                10.0    64.0    187
##  3        6 F          1.00 Last                 9.50   69.0    150
##  4        7 F          3.00 First                9.50   64.0    193
##  5        6 M          2.00 Middle              13.0    73.0    181
##  6        6 M          2.00 First               10.0    68.0    167
##  7        6 M          3.00 First               13.0    73.0    190
##  8        9 M          1.00 Last                12.0    74.0    195
##  9        2 F          3.00 Middle              11.0    69.5    180
## 10        4 M          3.00 Last                11.0    72.5    175
## # ... with 367 more rows, and 18 more variables: `dog vs. cat` <chr>,
## #   Handed <chr>, `On/Off Campus` <chr>, `Calories per day` <int>,
## #   `Servings of Fruit` <dbl>, `Cups of Water` <dbl>, `Cups of Coffee`
## #   <dbl>, `Hours of Sleep` <dbl>, `Hours spent studying per week` <dbl>,
## #   `Hours spent working per week` <dbl>, `Hours spent workingout per wee`
## #   <dbl>, `Hours socializing per w` <dbl>, `Politically Liberal` <int>,
## #   `Religiously C or L` <int>, `Socially C or L` <int>, Phone <chr>, `Hrs
## #   per day on phone` <dbl>, `Hrs/day on phone not talking` <dbl>

If it’s a tibble, it starts by telling you about the groups.

# Mean weight within each Gender group, ignoring missing values.
summarise(
  .data = by_gender,
  MeanWeight = mean(Weight, na.rm = TRUE)
)
## # A tibble: 3 x 2
##   Gender MeanWeight
##   <chr>       <dbl>
## 1 F             136
## 2 M             173
## 3 other         115

Snazzy! Do it again, this time by semester.

# Group by Semester, then compute the mean weight for each semester.
by_sem <- group_by(.data = KimData, Semester)
summarise(
  .data = by_sem,
  MeanWeight = mean(Weight, na.rm = TRUE)
)
## # A tibble: 11 x 2
##    Semester MeanWeight
##       <int>      <dbl>
##  1        0        155
##  2        1        149
##  3        2        149
##  4        3        161
##  5        4        146
##  6        5        154
##  7        6        156
##  8        7        171
##  9        8        144
## 10        9        195
## 11       10        210

Now, together.

# Group by both variables: one group per Gender/Semester combination.
by_Gsem <- group_by(.data = KimData, Gender, Semester)

# Mean weight within each Gender/Semester combination.
Gsem_means <- summarise(
  .data = by_Gsem,
  MeanWeight = mean(Weight, na.rm = TRUE)
)

It looks at all of the subgroups. It’s hard to see with that many.

# Open the table of subgroup means in the data viewer.
View(Gsem_means)

6. Pipes

A pipe is a way to connect multiple lines of code. It basically means, “take the result of this line down to the next line.” ggplot uses + as a pipe. That’s cool, but un-tidy (because + also means “add these up”). dplyr and most other tidyverse packages use a unique pipe that has no other meaning: %>%. No, really, that’s what it looks like. Yes, that’s weird. But, you have to admit that you aren’t going to use %>% for anything else.

# Average height for every Gender/Semester cell, with the smallest cells dropped.
tall_Kim <- KimData %>%                   # start from the full data set
  group_by(Gender, Semester) %>%          # no need to re-type KimData from here on
  summarize(
    count = n(),                          # number of students in each cell
    tall = mean(Height, na.rm = TRUE)     # average height, ignoring NAs
  ) %>%
  filter(count > 2) %>%                   # get rid of low-n categories
  arrange(tall)                           # sort from shortest to tallest
tall_Kim                                  # see what you made
## # A tibble: 16 x 4
## # Groups: Gender [2]
##    Gender Semester count  tall
##    <chr>     <int> <int> <dbl>
##  1 F             1    60  62.0
##  2 F             5     5  62.9
##  3 F             4    30  64.3
##  4 F             7     5  65.0
##  5 F             2    66  65.2
##  6 F             6    11  65.8
##  7 F             3    28  66.5
##  8 F             0     7  66.5
##  9 M             3    25  68.6
## 10 M             2    44  69.5
## 11 M             8     4  69.8
## 12 M             1    41  69.8
## 13 M             4    17  70.5
## 14 M             7    11  71.0
## 15 M             6     9  71.2
## 16 M             5     7  71.4

This script groups individuals by Gender and Semester, counts the number in each cell, then computes the average height. It eliminates low-n cells, then sorts it smallest to largest. That could be handy, right?

Notice that this script is long and wordy, but easy to understand. When you mix dplyr and ggplot, you have to be careful to get the pipes correct. That can be annoying, but you should keep your graphs away from your data management anyway. How about this? A chart of the average height by gender and semester (excluding small groups).

# Average height per semester, one colour per gender.
ggplot(
  data = tall_Kim,
  mapping = aes(x = Semester, y = tall, color = Gender)
) +
  geom_point()

Righties

Make a new version of the KimData, named “Righties”

# Start Righties as a copy of the full data set (the select() step that
# follows assigns to the same name again).
Righties <- KimData 

Select the KimPhysical vars discussed above, plus the 3 conservative/liberal ones.

# Physical-trait columns plus the three conservative/liberal scales.
Righties <- select(
  .data = KimData,
  Semester,
  Gender,
  `Shoe Size`,
  Height,
  Weight,
  Handed,
  `Politically Liberal`,
  `Religiously C or L`,
  `Socially C or L`
)
View(Righties)

Create a new variable, BMI, that calculates Body Mass Index from Height and Weight. Google to find that formula. [weight (lb) / height (in) / height (in)] x 703= BMI

# Add a BMI column: 703 * weight (lb) / height (in)^2, rounded to 2 decimals.
# round() must wrap the whole expression; with the parenthesis after 703 only
# the constant was rounded and the trailing 2 became a stray constant column.
# The result is stored under the name BMI, which the later pipeline reads.
BMI <- mutate(Righties, BMI = round(703 * Weight / (Height * Height), 2))

Limit your group to only Right-handed people.

# Keep right-handed students only.
Righties <- filter(.data = Righties, Handed == "Right")

Group your data by Semester

# Group (not just sort) the rows by Semester so that later summaries are
# computed once per semester; arrange() only reorders rows.
Righties <- group_by(Righties, Semester)

Calculate the average BMI by Semester.

# Average BMI per semester among right-handed students, lowest average first.
# Here BMI names both the data set being piped in and, inside summarize(),
# its BMI column.
handed_Righties <- BMI %>%
  filter(Handed == "Right") %>%
  group_by(Semester) %>%
  summarize(
    count = n(),                          # students per semester
    bodyType = mean(BMI, na.rm = TRUE)    # mean BMI, ignoring NAs
  ) %>%
  filter(count > 2) %>%                   # drop semesters with 2 or fewer students
  arrange(bodyType)
handed_Righties
## # A tibble: 9 x 3
##   Semester count bodyType
##      <int> <int>    <dbl>
## 1        8     6     22.0
## 2        2    95     22.9
## 3        4    44     22.9
## 4        5     9     23.2
## 5        0     9     23.3
## 6        6    20     23.5
## 7        3    47     25.0
## 8        1    91     25.0
## 9        7    13     26.4

include=FALSE and echo=FALSE

In knitr, what’s the difference between include=FALSE and echo=FALSE? Why could that be important?

echo = FALSE hides the code, but will evaluate it and show its output in the knit. include = FALSE hides the code AND the output from the knit, but will evaluate the code silently.

This is important because sometimes you want to show the output and sometimes you don’t.

BMI

Plot #1: Make a scatterplot for all individuals in the full KimData set, x=height, y=BMI

# Full data set plus a BMI column: 703 * weight (lb) / height (in)^2.
KimDataBMI <- mutate(.data = KimData, BMI = 703 * Weight / Height^2)

# Plot 1: BMI against height for everyone in the data set.
ggplot(data = KimDataBMI, mapping = aes(x = Height, y = BMI)) +
  geom_point()
## Warning: Removed 25 rows containing missing values (geom_point).

Plot #2: Color your scatterplot by Handed.

# Plot 2: the same scatterplot, coloured by handedness.
ggplot(
  data = KimDataBMI,
  mapping = aes(x = Height, y = BMI, color = Handed)
) +
  geom_point()
## Warning: Removed 25 rows containing missing values (geom_point).

Plot #3: Add smooth trendlines to make it snazzy

# Plot 3: jittered points plus a smooth trend line per handedness group.
# The mapping is declared once in ggplot() and inherited by both layers.
ggplot(
  data = KimDataBMI,
  mapping = aes(x = Height, y = BMI, color = Handed)
) +
  geom_jitter() +
  geom_smooth()
## `geom_smooth()` using method = 'loess'
## Warning: Removed 25 rows containing non-finite values (stat_smooth).
## Warning: Removed 25 rows containing missing values (geom_point).

From this plot, I can tell who is right and left handed and how their heights and BMIs relate. From the curve, I see that left-handed people have a higher BMI when shorter and right handed people have a lower BMI when taller. Also, there is only one ambidextrous person.