Make sure you have nessessay packages in your library and load dataset

library(ggplot2)
library(tidyverse)

## Loading tidyverse: tibble
## Loading tidyverse: tidyr
## Loading tidyverse: readr
## Loading tidyverse: purrr
## Loading tidyverse: dplyr

## Conflicts with tidy packages ----------------------------------------------

## filter(): dplyr, stats
## lag():    dplyr, stats

load(file="beer.RData")
str(beer)

## 'data.frame':    212 obs. of  6 variables:
##  $ BEER         : Factor w/ 212 levels "American Amber Lager",..: 1 2 3 5 7 6 9 10 16 17 ...
##  $ Brewery      : Factor w/ 68 levels "A.B. Pripps Bryggerier (Sweden)",..: 61 61 61 4 5 5 2 7 11 11 ...
##  $ Calories     : int  136 132 96 153 95 157 94 155 177 163 ...
##  $ Carbohydrates: num  10.5 10.5 7.6 16 3.2 8.9 2.6 14.2 15.6 13.9 ...
##  $ Alcohol      : num  4.1 4.1 3.2 4.9 4.2 5.9 4.1 4.6 5.2 4.7 ...
##  $ Type         : Factor w/ 2 levels "Domestic","Imported": 1 1 1 1 1 1 1 1 1 1 ...

table(beer$Carbohydrates, exclude=NULL) #See if there is NA in a variable

## 
##   1.9   2.4   2.6   3.1   3.2   3.5   3.9   4.4     5   5.3  5.33   5.5 
##     1     1     2     1     3     1     1     1     1     1     1     1 
##   5.7   5.8   5.9   6.2   6.5   6.6   6.7   6.8     7   7.3   7.4   7.5 
##     1     1     1     4     1     3     1     1     4     1     1     1 
##   7.6   7.8     8   8.3   8.6   8.7   8.9   9.3   9.5   9.6   9.8   9.9 
##     1     1     2     2     1     2     2     1     1     2     1     3 
##    10  10.2  10.4  10.5  10.6  10.8    11  11.1  11.2  11.4  11.5  11.6 
##     5     2     1     3     2     1     3     1     2     3     2     2 
##  11.8  11.9    12  12.1  12.2  12.3  12.5  12.7  12.9    13  13.1  13.3 
##     1     1     7     3     2     1     4     1     1     3     3     3 
##  13.4  13.5  13.7  13.9 13.99    14  14.1  14.2  14.3  14.5  14.6  14.7 
##     2     1     2     3     1     4     3     2     1     1     1     1 
##  14.8    15  15.3  15.6    16  16.2  16.6  16.7  16.8  16.9    17  17.3 
##     1     5     1     1     4     1     1     1     1     1     1     1 
##  17.4  17.9    18  18.4  18.8  18.9  19.3  19.4  19.7  19.9    20  20.2 
##     1     1     2     1     1     1     1     1     1     1     2     1 
##  21.5  21.8  21.9  22.3  23.9    25  32.1  <NA> 
##     1     1     1     1     1     1     1    39

Scatterplots

ggplot(data = beer) + 
  geom_point(mapping = aes(x = Alcohol, y = Calories))

# Scatterplot with regression line: Calories ~ Alcohol
  ggplot(data = beer) + 
  geom_point(mapping = aes(x = Alcohol, y = Calories)) +
  geom_smooth(mapping = aes(x = Alcohol, y = Calories), method=lm)

# Claories ~ Carbohydrates  
  ggplot(data = beer) + 
    geom_point(mapping = aes(x = Carbohydrates, y = Calories)) +
    geom_smooth(mapping = aes(x = Carbohydrates, y = Calories), method=lm)

## Warning: Removed 39 rows containing non-finite values (stat_smooth).

## Warning: Removed 39 rows containing missing values (geom_point).

# The geom_smooth allows you to fit different kinds of lines, including curved lines. 
# The parameter method=lm says that you want a straight linear regression.
  
# Scatterplot with a smooth estimate of the relationship (default is lowess method), i.e. not imposing that the relationship be linear:
  ggplot(data = beer) + 
  geom_point(mapping = aes(x = Alcohol, y = Calories)) +
  geom_smooth(mapping = aes(x = Alcohol, y = Calories))

## `geom_smooth()` using method = 'loess'

# Scatterplot with different colors for different values of a categorical variable called MyVarCat
ggplot(data = beer) + 
  geom_point(mapping = aes(x = Alcohol, y = Calories, color = Type))

Transforming a variable using logarithms

Log transformation is appropriate when the distribution of dependent variable (DV) or independent variable (IV) is right skewed. Cautions: Make sure there is NO zero or negative observation. Otherwise you use log(x + K) to ensure all the values are positive.

# let's check the distribution of the variables. 
hist(beer$Carbohydrates)

hist(beer$Alcohol)

hist(beer$Calories)

# summarize the variable first to see whether you need scalar 'k' when taking the transformation
summary(beer$Calories)

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    55.0   135.0   150.0   153.9   163.5   330.0

# To create the log of Carbohydrates, let???s call it logCal:
beer$logCal <- log(beer$Calories)
hist(beer$logCal)

ggplot(data = beer) + 
  geom_point(mapping = aes(x = Alcohol, y = logCal, color = Type)) +
  geom_smooth(mapping = aes(x = Alcohol, y = logCal), method=lm)

Correlation between two variables

cor(beer$Alcohol, beer$Calories)

## [1] 0.9054254

cor.test(beer$Alcohol, beer$Calories)

## 
##  Pearson's product-moment correlation
## 
## data:  beer$Alcohol and beer$Calories
## t = 30.909, df = 210, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  0.8777688 0.9270674
## sample estimates:
##       cor 
## 0.9054254

#When you have missing values in your variable, Carbohydrates in this case,
cor(beer$Carbohydrates, beer$Calories)

## [1] NA

cor(beer$Carbohydrates, beer$Calories, use="complete")

## [1] 0.7953174

cor.test(beer$Carbohydrates, beer$Calories)

## 
##  Pearson's product-moment correlation
## 
## data:  beer$Carbohydrates and beer$Calories
## t = 17.156, df = 171, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  0.7331090 0.8443283
## sample estimates:
##       cor 
## 0.7953174

#You may want to create a new dataframe that includes only those cases with complte data for the variables of interest.
newbeer<-beer[complete.cases(beer[,3:4,]),]

Linear regression predicting outcome Y based on values of X

# Let???s call our fitted model model1. Then we have:
model1=lm(data=beer, formula= Calories ~ Alcohol) 
# lm stands for Linear Model. If you want to know more about the function lm, try ?lm
# DV(Calories in this case) comes first 
summary(model1) #print the results

## 
## Call:
## lm(formula = Calories ~ Alcohol, data = beer)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -40.153 -13.999   1.616  11.124  54.555 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   9.0217     4.8421   1.863   0.0638 .  
## Alcohol      28.0219     0.9066  30.909   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 17.61 on 210 degrees of freedom
## Multiple R-squared:  0.8198, Adjusted R-squared:  0.8189 
## F-statistic: 955.3 on 1 and 210 DF,  p-value: < 2.2e-16

# ???lm??? stands for linear model, so the first letter is an L. The first line calculates the results of the linear regression.
# Interpretation: 1 degree increase in Alcohol is associated with 28 calories increase. 

# If we want to see what values of MyVarY the model model1 predicts for each value of MyVarX, we can create a new variable PredMyVarY this way:
beer$PredCalories <- predict(model1)

# Assume that I want to get predictions for three values of MyVarX, x1, x2, x3 and these values are not already in my dataset.
# I create a new dataset MyNewData with these values stored in MyVarX:
MyNewData <- data.frame(Alcohol=c(200,300,500))
#I then predict the new values of the outcomes like this and I store them in a new variable in MyNewData called PredMyVarY:
MyNewData$PredCalories <- predict(model1, newdata=MyNewData)

Scatterplots and linear regression

Seongho An

9/13/2017

Scatterplots

Transforming a variable using logarithms

Correlation between two variables

Linear regression predicting outcome Y based on values of X