use the following code for necessary packages

library(ggplot2)
library(tidyverse)

## Loading tidyverse: tibble
## Loading tidyverse: tidyr
## Loading tidyverse: readr
## Loading tidyverse: purrr
## Loading tidyverse: dplyr

## Conflicts with tidy packages ----------------------------------------------

## filter(): dplyr, stats
## lag():    dplyr, stats

Load the Beer data

load(file = "beer.RData")

Creating a residuals variable

Let’s call our fitted model model1. Then we have:

model1=lm(data=beer, formula= Calories ~ Alcohol) #After running this function, you should be able to see 'model1' at the top right window of R-studio

summary(model1) #this function will display the result of your regression model at the bottom window

## 
## Call:
## lm(formula = Calories ~ Alcohol, data = beer)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -40.153 -13.999   1.616  11.124  54.555 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   9.0217     4.8421   1.863   0.0638 .  
## Alcohol      28.0219     0.9066  30.909   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 17.61 on 210 degrees of freedom
## Multiple R-squared:  0.8198, Adjusted R-squared:  0.8189 
## F-statistic: 955.3 on 1 and 210 DF,  p-value: < 2.2e-16

If you want to see predicted value Y (Calories in this case) for each value of Alcohol

beer$PredCal <- predict(model1) # create a new variable with the predicted values based on model1
beer$PredCal <- round(beer$PredCal, digits = 2) #If you want to roundup the values

The residual value for each observation can be created: Calories - PredCal

beer$residCal <- beer$Calories- beer$PredCal

# Or you can create the residuals in one step immediately after fitting the model:
beer$ResidCal01 <- residuals(model1)

Plotting the Residuals

ggplot(data = beer) + 
  geom_point(mapping = aes(x = Alcohol, y = residCal)) +
  geom_hline(yintercept=0,col="red") #hline is for horizontal line y=0

Drawing regression lines while excluding certain observations

Typically, you want to exclude outliers you spotted on the scatter plot. So this would be an observation whose x (or y) value is above a high number or below a low number. For example, assume you want to exclude observations for which Calories >= B, with B a large number. Then we can only include observations with Calories < B using the subset function.

ggplot(beer) + 
  geom_point(mapping = aes(x = Alcohol, y = Calories)) +
  geom_smooth(mapping = aes(x = Alcohol, y = Calories), method=lm, se=FALSE) +  #the regression line including all observations
  geom_smooth(mapping = aes(x = Alcohol, y = Calories, col="red"), method = lm, se=FALSE, subset(beer, Alcohol < 10)) #the regression line only including observations with Alcohol < 9, and plots this in red for added salience.

Two-way table

Creating tables to describe the relationship between two categorical variables

You can convert a quantitative variable into a categorical variable. For example, you can create a categorical variable that describes percentiles of the distribution.

To create a categorical variable Cat MyVarX that indicates if MyVarX is below or above the median:

beer$CatAlcohol <- cut(beer$Alcohol, breaks=quantile(beer$Alcohol, probs=seq(0, 1, by=1/2)),include.lowest=TRUE)
#by=1/2 to divide the data in half. If you wanted thirds of data you could say by=1/3, and if you wanted quartiles, by=1/4.

#you can make labels for each category
beer$CatAlcohol <- cut(beer$Alcohol, breaks=quantile(beer$Alcohol, probs=seq(0, 1, by=1/2)),include.lowest=TRUE, labels = c ("low", "high"))

You can create a two way table of frequencies called MyTable using the table command and the two columns in the data frame:

MyTable <- table(beer$CatAlcohol, beer$Type)
print(MyTable)

##       
##        Domestic Imported
##   low        80       27
##   high       79       26

If we want to calculate proportions, margins, etc,

#To display the joint distribution (which gives the proportion for each cell):
prop.table(MyTable)

##       
##         Domestic  Imported
##   low  0.3773585 0.1273585
##   high 0.3726415 0.1226415

80/(80+79+27+26)

## [1] 0.3773585

#To display the conditional distribution for Type given CatAlcohol (the first variable in MyTable):
prop.table(MyTable,1)

##       
##         Domestic  Imported
##   low  0.7476636 0.2523364
##   high 0.7523810 0.2476190

80/(80+27)

## [1] 0.7476636

#To display the conditional distribution for CatAlcohol given Type (the second variable in MyTable):
prop.table(MyTable,2)

##       
##         Domestic  Imported
##   low  0.5031447 0.5094340
##   high 0.4968553 0.4905660

80/(80+79)

## [1] 0.5031447

Creating a mosaic plot to describe the relationship between two categorical variables

mosaicplot(MyTable, col ="grey")

Linear regression and residuals

Seongho An

9/21/2017

Creating a residuals variable

Plotting the Residuals

Drawing regression lines while excluding certain observations

Two-way table