Make sure you have nessessay packages in your library and load dataset
library(ggplot2)
library(tidyverse)
## Loading tidyverse: tibble
## Loading tidyverse: tidyr
## Loading tidyverse: readr
## Loading tidyverse: purrr
## Loading tidyverse: dplyr
## Conflicts with tidy packages ----------------------------------------------
## filter(): dplyr, stats
## lag(): dplyr, stats
load(file="beer.RData")
str(beer)
## 'data.frame': 212 obs. of 6 variables:
## $ BEER : Factor w/ 212 levels "American Amber Lager",..: 1 2 3 5 7 6 9 10 16 17 ...
## $ Brewery : Factor w/ 68 levels "A.B. Pripps Bryggerier (Sweden)",..: 61 61 61 4 5 5 2 7 11 11 ...
## $ Calories : int 136 132 96 153 95 157 94 155 177 163 ...
## $ Carbohydrates: num 10.5 10.5 7.6 16 3.2 8.9 2.6 14.2 15.6 13.9 ...
## $ Alcohol : num 4.1 4.1 3.2 4.9 4.2 5.9 4.1 4.6 5.2 4.7 ...
## $ Type : Factor w/ 2 levels "Domestic","Imported": 1 1 1 1 1 1 1 1 1 1 ...
table(beer$Carbohydrates, exclude=NULL) #See if there is NA in a variable
##
## 1.9 2.4 2.6 3.1 3.2 3.5 3.9 4.4 5 5.3 5.33 5.5
## 1 1 2 1 3 1 1 1 1 1 1 1
## 5.7 5.8 5.9 6.2 6.5 6.6 6.7 6.8 7 7.3 7.4 7.5
## 1 1 1 4 1 3 1 1 4 1 1 1
## 7.6 7.8 8 8.3 8.6 8.7 8.9 9.3 9.5 9.6 9.8 9.9
## 1 1 2 2 1 2 2 1 1 2 1 3
## 10 10.2 10.4 10.5 10.6 10.8 11 11.1 11.2 11.4 11.5 11.6
## 5 2 1 3 2 1 3 1 2 3 2 2
## 11.8 11.9 12 12.1 12.2 12.3 12.5 12.7 12.9 13 13.1 13.3
## 1 1 7 3 2 1 4 1 1 3 3 3
## 13.4 13.5 13.7 13.9 13.99 14 14.1 14.2 14.3 14.5 14.6 14.7
## 2 1 2 3 1 4 3 2 1 1 1 1
## 14.8 15 15.3 15.6 16 16.2 16.6 16.7 16.8 16.9 17 17.3
## 1 5 1 1 4 1 1 1 1 1 1 1
## 17.4 17.9 18 18.4 18.8 18.9 19.3 19.4 19.7 19.9 20 20.2
## 1 1 2 1 1 1 1 1 1 1 2 1
## 21.5 21.8 21.9 22.3 23.9 25 32.1 <NA>
## 1 1 1 1 1 1 1 39
ggplot(data = beer) +
geom_point(mapping = aes(x = Alcohol, y = Calories))
# Scatterplot with regression line: Calories ~ Alcohol
ggplot(data = beer) +
geom_point(mapping = aes(x = Alcohol, y = Calories)) +
geom_smooth(mapping = aes(x = Alcohol, y = Calories), method=lm)
# Claories ~ Carbohydrates
ggplot(data = beer) +
geom_point(mapping = aes(x = Carbohydrates, y = Calories)) +
geom_smooth(mapping = aes(x = Carbohydrates, y = Calories), method=lm)
## Warning: Removed 39 rows containing non-finite values (stat_smooth).
## Warning: Removed 39 rows containing missing values (geom_point).
# The geom_smooth allows you to fit different kinds of lines, including curved lines.
# The parameter method=lm says that you want a straight linear regression.
# Scatterplot with a smooth estimate of the relationship (default is lowess method), i.e. not imposing that the relationship be linear:
ggplot(data = beer) +
geom_point(mapping = aes(x = Alcohol, y = Calories)) +
geom_smooth(mapping = aes(x = Alcohol, y = Calories))
## `geom_smooth()` using method = 'loess'
# Scatterplot with different colors for different values of a categorical variable called MyVarCat
ggplot(data = beer) +
geom_point(mapping = aes(x = Alcohol, y = Calories, color = Type))
Log transformation is appropriate when the distribution of dependent variable (DV) or independent variable (IV) is right skewed. Cautions: Make sure there is NO zero or negative observation. Otherwise you use log(x + K) to ensure all the values are positive.
# let's check the distribution of the variables.
hist(beer$Carbohydrates)
hist(beer$Alcohol)
hist(beer$Calories)
# summarize the variable first to see whether you need scalar 'k' when taking the transformation
summary(beer$Calories)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 55.0 135.0 150.0 153.9 163.5 330.0
# To create the log of Carbohydrates, let???s call it logCal:
beer$logCal <- log(beer$Calories)
hist(beer$logCal)
ggplot(data = beer) +
geom_point(mapping = aes(x = Alcohol, y = logCal, color = Type)) +
geom_smooth(mapping = aes(x = Alcohol, y = logCal), method=lm)
cor(beer$Alcohol, beer$Calories)
## [1] 0.9054254
cor.test(beer$Alcohol, beer$Calories)
##
## Pearson's product-moment correlation
##
## data: beer$Alcohol and beer$Calories
## t = 30.909, df = 210, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## 0.8777688 0.9270674
## sample estimates:
## cor
## 0.9054254
#When you have missing values in your variable, Carbohydrates in this case,
cor(beer$Carbohydrates, beer$Calories)
## [1] NA
cor(beer$Carbohydrates, beer$Calories, use="complete")
## [1] 0.7953174
cor.test(beer$Carbohydrates, beer$Calories)
##
## Pearson's product-moment correlation
##
## data: beer$Carbohydrates and beer$Calories
## t = 17.156, df = 171, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## 0.7331090 0.8443283
## sample estimates:
## cor
## 0.7953174
#You may want to create a new dataframe that includes only those cases with complte data for the variables of interest.
newbeer<-beer[complete.cases(beer[,3:4,]),]
# Let???s call our fitted model model1. Then we have:
model1=lm(data=beer, formula= Calories ~ Alcohol)
# lm stands for Linear Model. If you want to know more about the function lm, try ?lm
# DV(Calories in this case) comes first
summary(model1) #print the results
##
## Call:
## lm(formula = Calories ~ Alcohol, data = beer)
##
## Residuals:
## Min 1Q Median 3Q Max
## -40.153 -13.999 1.616 11.124 54.555
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 9.0217 4.8421 1.863 0.0638 .
## Alcohol 28.0219 0.9066 30.909 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 17.61 on 210 degrees of freedom
## Multiple R-squared: 0.8198, Adjusted R-squared: 0.8189
## F-statistic: 955.3 on 1 and 210 DF, p-value: < 2.2e-16
# ???lm??? stands for linear model, so the first letter is an L. The first line calculates the results of the linear regression.
# Interpretation: 1 degree increase in Alcohol is associated with 28 calories increase.
# If we want to see what values of MyVarY the model model1 predicts for each value of MyVarX, we can create a new variable PredMyVarY this way:
beer$PredCalories <- predict(model1)
# Assume that I want to get predictions for three values of MyVarX, x1, x2, x3 and these values are not already in my dataset.
# I create a new dataset MyNewData with these values stored in MyVarX:
MyNewData <- data.frame(Alcohol=c(200,300,500))
#I then predict the new values of the outcomes like this and I store them in a new variable in MyNewData called PredMyVarY:
MyNewData$PredCalories <- predict(model1, newdata=MyNewData)