knitr::opts_chunk$set(echo = TRUE)
library(tidyverse)
## -- Attaching packages ------------------------------------------------------------------------------------------- tidyverse 1.2.1 --
## v ggplot2 2.2.1     v purrr   0.2.4
## v tibble  1.4.1     v dplyr   0.7.4
## v tidyr   0.7.2     v stringr 1.2.0
## v readr   1.1.1     v forcats 0.2.0
## -- Conflicts ---------------------------------------------------------------------------------------------- tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
KimData <- read.csv("C:/LocalFiles/Documents/Freshman TSU/STAT-220/R Lab 3/Clean-KimData.csv")
#Let’s just select the numerical variables, plus Gender (which is NOT really numerical)
KimNumRaw <- KimData[c(1:3,5:7, 12, 13:15, 20:22)]
# we are now missing column 4 (Birth Order), plus dogvcat., Handed, Off/On campus, etc.

Cleaning

Now, let’s clean the data nicely.

KimNum <- KimNumRaw %>%
  replace_na(list(Cups.of.Water=0,Cups.of.Coffee=0)) %>%
  mutate(Shoe.Size=as.numeric(sub(113, 11, Shoe.Size, fixed =TRUE))) %>%
  mutate(Gender=as.factor(Gender))

  summary(KimData$Cups.of.Coffee)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##  0.0000  0.0000  0.5000  0.8112  1.0000  5.0000     123
  summary(KimNum$Cups.of.Coffee)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##  0.0000  0.0000  0.0000  0.5466  1.0000  5.0000
  summary(KimData$Cups.of.Water) #fewer people skipped this one (7 NAs)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##   0.000   3.625   5.000   5.689   8.000  32.000       7
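
Just to double-check that the Shoe.Size typo got recoded, a quick sketch (output not reproduced here):

summary(KimNum$Shoe.Size) #the max should now be a plausible shoe size, not 113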

Part I. Scatterplots are awesome!

Part Ia. Scatterplots are awesome! Without ggplot, dplyr, or the tidyverse at all. Make scatterplots of everything: pairs() is a base R command that does this.

pairs(KimData[1:10])

#Well, maybe not everything. Not so good for non-numerical data
pairs(KimNum[3:13])

#This is still pretty tiny print
pairs(KimNum[c(1,3:7)])

#Add a histogram on the diagonal (script from the help menu for ?pairs)
panel.hist <- function(x, ...)
{
  usr <- par("usr"); on.exit(par(usr))  #save this panel's coordinates; restore them on exit
  par(usr = c(usr[1:2], 0, 1.5) )       #keep the x-range, set the y-range to 0-1.5 so bars fit
  h <- hist(x, plot = FALSE)            #compute the histogram without drawing it
  breaks <- h$breaks; nB <- length(breaks)
  y <- h$counts; y <- y/max(y)          #rescale bar heights so the tallest is 1
  rect(breaks[-nB], 0, breaks[-1], y, col = "cyan", ...)  #draw the bars
}
#find your new function in the Environment tab in the upper right.

pairs(KimNum[c(1,3:7)], panel = panel.smooth,
      cex = 1.5, pch = 24, bg = "light blue",
      diag.panel = panel.hist, cex.labels = 1, font.labels = 2)

#Can you understand what each part of the script does?
#cex is a size multiplier, pch selects a character, bg is a background color.
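
A tiny sketch (on made-up numbers, not KimData) of what those three arguments control:

plot(1:5, 1:5,
     cex = 1.5,          #cex: size multiplier for the points
     pch = 21:25,        #pch: which plotting character (21-25 take a fill color)
     bg  = "light blue") #bg: the fill color for those characters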

#What if the upper triangle showed the correlations instead of the mirror-image scatterplots?
panel.cor <- function(x, y, digits = 2, prefix = "", cex.cor, ...)
{
  usr <- par("usr"); on.exit(par(usr))  #save this panel's coordinates; restore them on exit
  par(usr = c(0, 1, 0, 1))              #switch to a simple 0-1 coordinate system for the text
  r <- abs(cor(x, y, use="pairwise.complete.obs"))  #absolute correlation, skipping missing pairs
  txt <- format(c(r, 0.123456789), digits = digits)[1]
  txt <- paste0(prefix, txt)
  if(missing(cex.cor)) cex.cor <- 1/strwidth(txt)
  text(0.5, 0.5, txt, cex = max(cex.cor * r,1.5))   #bigger text for stronger correlations
}
#you can also find this function in the Environment tab in the upper right.
#you do not have to understand how these functions work, but take a second to try.

pairs(KimNum[c(1,3:7)], lower.panel = panel.smooth, upper.panel = panel.cor)

#pretty snazzy, eh?

#Why not both!
#"Complete" version of pairs
pairs(KimNum[c(1,3:7)], lower.panel = panel.smooth, upper.panel = panel.cor,
      bg = "light blue", diag.panel = panel.hist, cex.labels = 1, font.labels = 2)

#Can you understand what each part of this script does?

Part Ib. Scatterplots are awesome! So are ggplots and dplyr

KimNum %>%
  gather(-Semester, -Gender, key = "var", value = "value", na.rm=TRUE) %>%
  ggplot(aes(y = value, x = Semester)) +
  geom_jitter() +
  facet_wrap(~ var, scales = "free")+
  scale_x_continuous(breaks= c(1,3,5,7,9))

#gather is part of the tidyr package; it stacks a bunch of variables into key/value columns.
#Since we didn't assign the result (with <- or ->), the gathered data isn't saved anywhere.
#scale_x_continuous(breaks = ...) labels the x-axis cleanly
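
If you did want to keep the long-format data around, just assign it to a name (KimLong here is a made-up name, purely a sketch):

KimLong <- KimNum %>%
  gather(-Semester, -Gender, key = "var", value = "value", na.rm=TRUE)
head(KimLong) #each row is one student's response to one gathered variable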


#Can add in color for a variable.
GenderColor <- c("orchid", "deepskyblue", "green")
#scale_color_manual applies these in the order of the factor levels (you can check the levels on the Environment tab)
#Your color sheet has a bunch listed, but there are a zillion choices.
#These are a bit too gender-stereotyped, but are good 1980s colors.
KimNum %>%
  gather(-Semester, -Gender, key = "var", value = "value", na.rm=TRUE) %>%
  ggplot(aes(y = value, x = Semester, color= Gender)) +
  geom_jitter() +
  scale_color_manual(values=GenderColor)+
  facet_wrap(~ var, scales = "free")

#geom_smooth is cool, too (remember lm = linear model)
KimNum %>%
  gather(-Semester, -Gender, key = "var", value = "value", na.rm=TRUE) %>%
  ggplot(aes(y = value, x = Semester, color= Gender)) +
  geom_jitter() +
  stat_smooth(method="lm")+
  scale_color_manual(values=GenderColor)+
  facet_wrap(~ var, scales = "free")+
  scale_x_continuous(breaks= c(1,3,5,7,9))

#The Lab Assignment asks you to make this chart for a different panel variable.
#method = "lm" means fit a straight line (a linear model)

Part III. Regression is easy!

#Regression is Easy!
#Let’s look just at Height v. Semester, since it looks like something is happening there.
    ggplot(data=KimNum, aes(y = Height, x = Semester)) +
    geom_point()+
    geom_smooth(method="lm")
## Warning: Removed 17 rows containing non-finite values (stat_smooth).
## Warning: Removed 17 rows containing missing values (geom_point).

#Remember how linear regression works: predicted Y = Intercept + Slope * X.
#We can calculate the slope and intercept.
  
R1 <- lm(Height ~ Semester, data = KimNum)
  #the stuff before the ~ is the Dependent variable (Y),
  # and the stuff after the ~ is the Independent variable(s) (X)
  #Regression creates a (simple) model for prediction. We will get more complicated soon enough.
  #R1 <- lm(KimNum$Height ~ KimNum$Semester)
  #would do the same thing, if you liked the $ notation better.
  #i.e., the thing you're predicting ~ the thing you're predicting it with.
R1
## 
## Call:
## lm(formula = Height ~ Semester, data = KimNum)
## 
## Coefficients:
## (Intercept)     Semester  
##     65.5363       0.4582
  #intercept = 65.5363; Semester coefficient = 0.4582 (inches taller per semester)
#Remember how regression works: predicted Height = 65.5363 + 0.4582 * Semester.
#Regression creates a (simple) model for prediction.
#So, for each semester older a STAT 190 student is, s/he is about ½ inch taller?
#Does that make sense? Can you make a causal argument?
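
As a quick sketch of how the fitted model predicts (Semester = 5 is just an arbitrary example value):

coef(R1)                                        #the intercept and slope again
65.5363 + 0.4582 * 5                            #by hand: about 67.8 inches
predict(R1, newdata = data.frame(Semester = 5)) #same answer via predict()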

summary(R1)
## 
## Call:
## lm(formula = Height ~ Semester, data = KimNum)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -20.9945  -2.4527   0.5891   3.0891  11.4637 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  65.5363     0.4631 141.528  < 2e-16 ***
## Semester      0.4582     0.1388   3.302  0.00106 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 4.976 on 358 degrees of freedom
##   (17 observations deleted due to missingness)
## Multiple R-squared:  0.02955,    Adjusted R-squared:  0.02684 
## F-statistic:  10.9 on 1 and 358 DF,  p-value: 0.001058
#summary gives a more detailed response.
#R-squared is percent of explained variation = 3% (pretty tiny)
#Does that make sense? Can you make a causal argument?

cor(KimNum$Height,KimNum$Semester, use="pairwise.complete.obs")
## [1] 0.1719027
#Correlation = R, R-squared is percent of explained variation.
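
A quick sketch of that relationship -- squaring the correlation should reproduce the Multiple R-squared from summary(R1):

cor(KimNum$Height, KimNum$Semester, use="pairwise.complete.obs")^2 #matches the Multiple R-squared (0.02955) above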

plot(y=KimNum$Height,x=KimNum$Semester)
abline(R1) #sticks line on top of plot -- smooth is similar

#the first one plots the data; the second one adds your line of best fit.
#yes, ggplot already drew this for us, but it had to do the same calculation behind the scenes.

#the lm function creates an object of class “lm,” which is a list of 13 things.
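
A quick sketch of peeking inside that list (output not shown here):

names(R1) #the components lm stored
coef(R1)  #pulls out just the intercept and slope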

R2 <- lm(Height ~ Gender, data = KimNum)
R2
## 
## Call:
## lm(formula = Height ~ Gender, data = KimNum)
## 
## Coefficients:
## (Intercept)      GenderM  Genderother  
##      64.376        5.565       -2.376
plot(y=KimNum$Height, x=KimNum$Gender) #don't want to do this

#For a factor variable, lm gives a value for the effect (it’s actually an ANOVA, if you cared).
#The first level (F in this case) is the baseline; the other coefficients are the change relative to it.
#So, someone who responded M is about 5.5 inches taller on average.
#Respondents who chose "other" average 2.376 inches shorter.
#You can think of factors as a series of binary (0-1) variables.
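
A sketch of that 0-1 coding -- model.matrix shows the design matrix lm actually builds behind the scenes:

head(model.matrix(~ Gender, data = KimNum))
#GenderM is 1 for respondents who answered M, Genderother is 1 for "other", and F is the baseline (all zeros).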

#We will soon be getting to multiple variables at once (which isn't much trickier):
  R1G <- lm(Height ~ Semester + Gender, data = KimNum)
R1G
## 
## Call:
## lm(formula = Height ~ Semester + Gender, data = KimNum)
## 
## Coefficients:
## (Intercept)     Semester      GenderM  Genderother  
##     63.6435       0.2867       5.4356      -2.2169

1) Walk through this lab (we probably finished it in class, but who knows?). If you did it correctly, you’ll have an RMarkdown file with chunks and some thoughtful text.

2) Use the “complete” pairs() command to examine additional columns, 3:7 AND 11:13. Remember that 11:13 ask about conservative/liberal attitudes in 3 areas.

pairs(KimNum[c(3:7, 11:13)], lower.panel = panel.smooth, upper.panel = panel.cor,
      bg = "light blue", diag.panel = panel.hist, cex.labels = 1, font.labels = 2)

  #Does it look like 11:13 correlate to each other?

#It appears that 11:13 do correlate with each other, though not extremely strongly (0.58, 0.58, and 0.72).

  #Which of the original variables (3:7) appear to correlate with 11:13?

#Shoe size and weight appear to have slight correlations with Politically.Liberal (0.27 and 0.22).

  #Which is the largest negative correlation? Interpret that pairwise relationship.

#The largest negative correlation is between weight and Politically.Liberal; its regression line has the steepest
#downhill slope.  This tells us that as weight increases, the Politically.Liberal rating tends to decrease.
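
One way to confirm the sign directly (a sketch -- the panel.cor display above shows only |r|, and this assumes the weight column is literally named Weight, like Height and Shoe.Size):

cor(KimNum$Weight, KimNum$Politically.Liberal, use="pairwise.complete.obs") #should come out negative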

3) Use a ggplot panel to examine Politically.Liberal (v11) against the other KimNum variables. Those who identify as politically liberal have a higher value (up to +2) than those who identify as politically conservative (down to -2)

     #a) First, make a new GenderColor vector. Since you reject Gender Norms, use:
        #F: “darkseagreen3”
        #M: “mediumpurple2”
        #other: “firebrick”
GenderColor <- c("darkseagreen3", "mediumpurple2", "firebrick")
KimNum %>%
  gather(-Politically.Liberal, -Gender, key = "var", value = "value", na.rm=TRUE) %>%
  ggplot(aes(y = value, x = Politically.Liberal, color= Gender)) +
  geom_jitter() +
  stat_smooth(method="lm")+
  scale_color_manual(values=GenderColor)+
  facet_wrap(~ var, scales = "free")
## Warning: Removed 56 rows containing non-finite values (stat_smooth).
## Warning: Removed 56 rows containing missing values (geom_point).

#Looking at Politically.Liberal against Cups.of.Coffee and Cups.of.Water, does it seem fair to say that, among this sample, self-identified liberal women tend to drink less water and more coffee than respondents who responded as conservative?

#No -- from the graphs, it is not reasonable to say that self-identified liberal women tend to drink less water and more coffee: the regression lines in both panels are fairly flat, so there is no clear difference.

#Looking at Politically.Liberal against Height, Weight, and Shoe.Size, is it fair to say that more politically liberal men tend to be smaller than conservative men, at least among this sample?
  
#Among this sample, it is fair to say that more politically liberal men tend to be smaller than conservative men: in each of those panels, the line for men slopes downward as Politically.Liberal increases.

4) Compute several simple regressions using lm(). For each of these three models:

 #i Display the slope and intercept (just type R2 or whatever)
R2
## 
## Call:
## lm(formula = Height ~ Gender, data = KimNum)
## 
## Coefficients:
## (Intercept)      GenderM  Genderother  
##      64.376        5.565       -2.376
    #ii Look at the summary and simple plot (see both pairs and panel, above)
    #iii Explain anything interesting you see (using words and thinking and stuff).
#The intercept is 64.376; the coefficient for GenderM is 5.565 and for Genderother is -2.376.
#Someone who responded M is about 5.5 inches taller on average, and respondents who chose "other" average 2.376 inches shorter.
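
For step ii, a quick sketch (output not repeated here) -- the "simple plot" is the pairs/panel chart made earlier:

summary(R2) #adds standard errors, p-values, and R-squared to the coefficients shown above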

R3 <- lm(Politically.Liberal ~ Semester, data = KimNum)
R3
## 
## Call:
## lm(formula = Politically.Liberal ~ Semester, data = KimNum)
## 
## Coefficients:
## (Intercept)     Semester  
##     0.34312     -0.06675
R4 <- lm(Politically.Liberal ~ Shoe.Size, data = KimNum)
R4
## 
## Call:
## lm(formula = Politically.Liberal ~ Shoe.Size, data = KimNum)
## 
## Coefficients:
## (Intercept)    Shoe.Size  
##      1.4358      -0.1379
R5 <- lm(Politically.Liberal ~ Socially.C.or.L, data = KimNum)
R5
## 
## Call:
## lm(formula = Politically.Liberal ~ Socially.C.or.L, data = KimNum)
## 
## Coefficients:
##     (Intercept)  Socially.C.or.L  
##         -0.1690           0.7208
R6 <- lm(Politically.Liberal ~ Gender, data = KimNum)
R6
## 
## Call:
## lm(formula = Politically.Liberal ~ Gender, data = KimNum)
## 
## Coefficients:
## (Intercept)      GenderM  Genderother  
##      0.3095      -0.3473       0.6905

5) We will soon be getting to multiple variables at once (which isn't much trickier):

R3G <- lm(Politically.Liberal ~ Shoe.Size + Gender, data = KimNum)
R3G
## 
## Call:
## lm(formula = Politically.Liberal ~ Shoe.Size + Gender, data = KimNum)
## 
## Coefficients:
## (Intercept)    Shoe.Size      GenderM  Genderother  
##     1.54618     -0.15467      0.09999      0.53654
#(notice that Gender adds separate adjustments for GenderM and Genderother).
#How is this different than what you found in 4c)?

#This differs from what I found in 4c: once Gender is included, the intercept is larger and the Shoe.Size coefficient is more negative (-0.155 vs -0.138).
#The model also reports separate GenderM and Genderother effects.
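
A quick sketch of turning R3G's coefficients into a prediction (Shoe.Size = 10 and Gender = "M" are arbitrary example values):

coef(R3G)
predict(R3G, newdata = data.frame(Shoe.Size = 10, Gender = "M"))
#by hand: 1.54618 - 0.15467*10 + 0.09999, which is roughly 0.1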

Knit an RMarkdown file (start from the one you already have) to submit all of this. Upload your .html file and your .rmd file into Google Classroom.