use the following code for necessary packages
library(ggplot2)
library(tidyverse)
## Loading tidyverse: tibble
## Loading tidyverse: tidyr
## Loading tidyverse: readr
## Loading tidyverse: purrr
## Loading tidyverse: dplyr
## Conflicts with tidy packages ----------------------------------------------
## filter(): dplyr, stats
## lag(): dplyr, stats
Load the Beer data
load(file = "beer.RData")
Let’s call our fitted model model1. Then we have:
model1=lm(data=beer, formula= Calories ~ Alcohol) #After running this function, you should be able to see 'model1' at the top right window of R-studio
summary(model1) #this function will display the result of your regression model at the bottom window
##
## Call:
## lm(formula = Calories ~ Alcohol, data = beer)
##
## Residuals:
## Min 1Q Median 3Q Max
## -40.153 -13.999 1.616 11.124 54.555
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 9.0217 4.8421 1.863 0.0638 .
## Alcohol 28.0219 0.9066 30.909 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 17.61 on 210 degrees of freedom
## Multiple R-squared: 0.8198, Adjusted R-squared: 0.8189
## F-statistic: 955.3 on 1 and 210 DF, p-value: < 2.2e-16
If you want to see predicted value Y (Calories in this case) for each value of Alcohol
beer$PredCal <- predict(model1) # create a new variable with the predicted values based on model1
beer$PredCal <- round(beer$PredCal, digits = 2) #If you want to roundup the values
The residual value for each observation can be created: Calories - PredCal
beer$residCal <- beer$Calories- beer$PredCal
# Or you can create the residuals in one step immediately after fitting the model:
beer$ResidCal01 <- residuals(model1)
ggplot(data = beer) +
geom_point(mapping = aes(x = Alcohol, y = residCal)) +
geom_hline(yintercept=0,col="red") #hline is for horizontal line y=0
Typically, you want to exclude outliers you spotted on the scatter plot. So this would be an observation whose x (or y) value is above a high number or below a low number. For example, assume you want to exclude observations for which Calories >= B, with B a large number. Then we can only include observations with Calories < B using the subset function.
ggplot(beer) +
geom_point(mapping = aes(x = Alcohol, y = Calories)) +
geom_smooth(mapping = aes(x = Alcohol, y = Calories), method=lm, se=FALSE) + #the regression line including all observations
geom_smooth(mapping = aes(x = Alcohol, y = Calories, col="red"), method = lm, se=FALSE, subset(beer, Alcohol < 10)) #the regression line only including observations with Alcohol < 9, and plots this in red for added salience.
Creating tables to describe the relationship between two categorical variables
You can convert a quantitative variable into a categorical variable. For example, you can create a categorical variable that describes percentiles of the distribution.
To create a categorical variable Cat MyVarX that indicates if MyVarX is below or above the median:
beer$CatAlcohol <- cut(beer$Alcohol, breaks=quantile(beer$Alcohol, probs=seq(0, 1, by=1/2)),include.lowest=TRUE)
#by=1/2 to divide the data in half. If you wanted thirds of data you could say by=1/3, and if you wanted quartiles, by=1/4.
#you can make labels for each category
beer$CatAlcohol <- cut(beer$Alcohol, breaks=quantile(beer$Alcohol, probs=seq(0, 1, by=1/2)),include.lowest=TRUE, labels = c ("low", "high"))
You can create a two way table of frequencies called MyTable using the table command and the two columns in the data frame:
MyTable <- table(beer$CatAlcohol, beer$Type)
print(MyTable)
##
## Domestic Imported
## low 80 27
## high 79 26
If we want to calculate proportions, margins, etc,
#To display the joint distribution (which gives the proportion for each cell):
prop.table(MyTable)
##
## Domestic Imported
## low 0.3773585 0.1273585
## high 0.3726415 0.1226415
80/(80+79+27+26)
## [1] 0.3773585
#To display the conditional distribution for Type given CatAlcohol (the first variable in MyTable):
prop.table(MyTable,1)
##
## Domestic Imported
## low 0.7476636 0.2523364
## high 0.7523810 0.2476190
80/(80+27)
## [1] 0.7476636
#To display the conditional distribution for CatAlcohol given Type (the second variable in MyTable):
prop.table(MyTable,2)
##
## Domestic Imported
## low 0.5031447 0.5094340
## high 0.4968553 0.4905660
80/(80+79)
## [1] 0.5031447
Creating a mosaic plot to describe the relationship between two categorical variables
mosaicplot(MyTable, col ="grey")