This question involvese the use of multiple linear regression with categorical variables on the “Carseat” dataset from the textbook (using library(ISLR))
library(ISLR)
## Warning: package 'ISLR' was built under R version 3.6.2
data("Carseats")
str(Carseats)
## 'data.frame': 400 obs. of 11 variables:
## $ Sales : num 9.5 11.22 10.06 7.4 4.15 ...
## $ CompPrice : num 138 111 113 117 141 124 115 136 132 132 ...
## $ Income : num 73 48 35 100 64 113 105 81 110 113 ...
## $ Advertising: num 11 16 10 4 3 13 0 15 0 0 ...
## $ Population : num 276 260 269 466 340 501 45 425 108 131 ...
## $ Price : num 120 83 80 97 128 72 108 120 124 124 ...
## $ ShelveLoc : Factor w/ 3 levels "Bad","Good","Medium": 1 2 3 3 1 1 3 2 3 3 ...
## $ Age : num 42 65 59 55 38 78 71 67 76 76 ...
## $ Education : num 17 10 12 14 13 16 15 10 10 17 ...
## $ Urban : Factor w/ 2 levels "No","Yes": 2 2 2 2 2 1 2 2 1 1 ...
## $ US : Factor w/ 2 levels "No","Yes": 2 2 2 2 1 2 1 2 1 2 ...
carmod<-lm(Sales~Price+Urban+US, data = Carseats)
summary(carmod)
##
## Call:
## lm(formula = Sales ~ Price + Urban + US, data = Carseats)
##
## Residuals:
## Min 1Q Median 3Q Max
## -6.9206 -1.6220 -0.0564 1.5786 7.0581
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 13.043469 0.651012 20.036 < 2e-16 ***
## Price -0.054459 0.005242 -10.389 < 2e-16 ***
## UrbanYes -0.021916 0.271650 -0.081 0.936
## USYes 1.200573 0.259042 4.635 4.86e-06 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2.472 on 396 degrees of freedom
## Multiple R-squared: 0.2393, Adjusted R-squared: 0.2335
## F-statistic: 41.52 on 3 and 396 DF, p-value: < 2.2e-16
### yi = beta_not + beta_one_xi + alpha_one + alpha_two + epsilon_i
### Where beta_not is "Sales", beta_one_xi is "Price", alpha_one is "Urban", and alpha_two is "US". Epsilon is random noise generated.
summary(carmod)
##
## Call:
## lm(formula = Sales ~ Price + Urban + US, data = Carseats)
##
## Residuals:
## Min 1Q Median 3Q Max
## -6.9206 -1.6220 -0.0564 1.5786 7.0581
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 13.043469 0.651012 20.036 < 2e-16 ***
## Price -0.054459 0.005242 -10.389 < 2e-16 ***
## UrbanYes -0.021916 0.271650 -0.081 0.936
## USYes 1.200573 0.259042 4.635 4.86e-06 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2.472 on 396 degrees of freedom
## Multiple R-squared: 0.2393, Adjusted R-squared: 0.2335
## F-statistic: 41.52 on 3 and 396 DF, p-value: < 2.2e-16
carmod2<-lm(Sales~Price+US, data = Carseats)
summary(carmod2)
##
## Call:
## lm(formula = Sales ~ Price + US, data = Carseats)
##
## Residuals:
## Min 1Q Median 3Q Max
## -6.9269 -1.6286 -0.0574 1.5766 7.0515
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 13.03079 0.63098 20.652 < 2e-16 ***
## Price -0.05448 0.00523 -10.416 < 2e-16 ***
## USYes 1.19964 0.25846 4.641 4.71e-06 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2.469 on 397 degrees of freedom
## Multiple R-squared: 0.2393, Adjusted R-squared: 0.2354
## F-statistic: 62.43 on 2 and 397 DF, p-value: < 2.2e-16
anova(carmod)
## Analysis of Variance Table
##
## Response: Sales
## Df Sum Sq Mean Sq F value Pr(>F)
## Price 1 630.03 630.03 103.0603 < 2.2e-16 ***
## Urban 1 0.10 0.10 0.0158 0.9001
## US 1 131.31 131.31 21.4802 4.86e-06 ***
## Residuals 396 2420.83 6.11
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
anova(carmod2)
## Analysis of Variance Table
##
## Response: Sales
## Df Sum Sq Mean Sq F value Pr(>F)
## Price 1 630.03 630.03 103.319 < 2.2e-16 ***
## US 1 131.37 131.37 21.543 4.707e-06 ***
## Residuals 397 2420.87 6.10
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
confint(carmod)
## 2.5 % 97.5 %
## (Intercept) 11.76359670 14.32334118
## Price -0.06476419 -0.04415351
## UrbanYes -0.55597316 0.51214085
## USYes 0.69130419 1.70984121
Use your regression project dataset to explore models that incorporate categorical predictors.
library(tidyverse)
## -- Attaching packages ------------------------------------------------------------------------- tidyverse 1.2.1 --
## v ggplot2 3.2.1 v purrr 0.3.2
## v tibble 2.1.3 v dplyr 0.8.3
## v tidyr 1.0.0 v stringr 1.4.0
## v readr 1.3.1 v forcats 0.4.0
## -- Conflicts ---------------------------------------------------------------------------- tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
data<-read.csv("Minimum Wage Data.csv", header=TRUE)
#create new column with average value from high and low value
data$Average.Value <- (data$High.Value + data$Low.Value)/2
minwage <- filter(data, Year %in% c("2012", "2013", "2014", "2015", "2016", "2017"))
data2 <- read.csv("Emp. Rates 2012.csv", header = TRUE)
data2$Year <- "2012"
names(data2) <- c("State","Population", "Employed", "Unemployed", "Year")
data3 <- read.csv("Emp. Rates 2013.csv", header = TRUE)
data3$Year <- "2013"
names(data3) <- c("State","Population", "Employed", "Unemployed", "Year")
data4 <- read.csv("Emp. Rates 2014.csv", header = TRUE)
data4$Year <- "2014"
names(data4) <- c("State","Population", "Employed", "Unemployed", "Year")
data5 <- read.csv("Emp. Rates 2015.csv", header = TRUE)
data5$Year <- "2015"
names(data5) <- c("State","Population", "Employed", "Unemployed", "Year")
data6 <- read.csv("Emp. Rates 2016.csv", header = TRUE)
data6$Year <- "2016"
names(data6) <- c("State","Population", "Employed", "Unemployed", "Year")
data7<-read.csv("Emp. Rates 2017.csv", header = TRUE)
data7$Year<-"2017"
names(data7)<-c("State","Population", "Employed", "Unemployed", "Year")
join<-rbind(data2, data3, data4, data5, data6, data7)
excludeStates<-c("Alabama", "Louisiana", "Tennessee", "South Carolina", "U.S. Virgin Islands", "Mississippi", "Puerto Rico", "District of Columbia", "Federal (FLSA)", "Guam")
employmentdata<-join%>%
filter(!State %in% excludeStates)
minwage<-minwage%>%
filter(!State %in% excludeStates)
employmentdata$Year<-as.integer(employmentdata$Year)
joined<-inner_join(minwage, employmentdata)
## Joining, by = c("Year", "State")
## Warning: Column `State` joining factors with different levels, coercing to
## character vector
joined<-joined[, -c(3,4)]
mod<-lm(Average.Value~Population, data = joined)
summary(mod)
##
## Call:
## lm(formula = Average.Value ~ Population, data = joined)
##
## Residuals:
## Min 1Q Median 3Q Max
## -3.0019 -0.4156 -0.1696 0.5764 3.3066
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 7.548e+00 9.001e-02 83.855 <2e-16 ***
## Population 2.571e-08 1.141e-08 2.253 0.025 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.108 on 268 degrees of freedom
## Multiple R-squared: 0.01859, Adjusted R-squared: 0.01493
## F-statistic: 5.078 on 1 and 268 DF, p-value: 0.02505
plot(mod)
NE.name <- c("Connecticut","Maine","Massachusetts","New Hampshire",
"Rhode Island","Vermont","New Jersey","New York",
"Pennsylvania")
NE.abrv <- c("CT","ME","MA","NH","RI","VT","NJ","NY","PA")
NE.ref <- c(NE.name,NE.abrv)
MW.name <- c("Indiana","Illinois","Michigan","Ohio","Wisconsin",
"Iowa","Kansas","Minnesota","Missouri","Nebraska",
"North Dakota","South Dakota")
MW.abrv <- c("IN","IL","MI","OH","WI","IA","KS","MN","MO","NE",
"ND","SD")
MW.ref <- c(MW.name,MW.abrv)
S.name <- c("Delaware","District of Columbia","Florida","Georgia",
"Maryland","North Carolina","South Carolina","Virginia",
"West Virginia","Alabama","Kentucky","Mississippi",
"Tennessee","Arkansas","Louisiana","Oklahoma","Texas")
S.abrv <- c("DE","DC","FL","GA","MD","NC","SC","VA","WV","AL",
"KY","MS","TN","AR","LA","OK","TX")
S.ref <- c(S.name,S.abrv)
W.name <- c("Arizona","Colorado","Idaho","New Mexico","Montana",
"Utah","Nevada","Wyoming","Alaska","California",
"Hawaii","Oregon","Washington")
W.abrv <- c("AZ","CO","ID","NM","MT","UT","NV","WY","AK","CA",
"HI","OR","WA")
W.ref <- c(W.name,W.abrv)
region.list <- list(
Northeast=NE.ref,
Midwest=MW.ref,
South=S.ref,
West=W.ref)
joined$Region<-sapply(joined$State,
function(x) names(region.list)[grep(x, region.list)])
joined$Region<-as.factor(joined$Region)
contrasts(joined$Region)
## Northeast South West
## Midwest 0 0 0
## Northeast 1 0 0
## South 0 1 0
## West 0 0 1
mod2<-lm(Average.Value~Region, data = joined)
anova(mod2)
## Analysis of Variance Table
##
## Response: Average.Value
## Df Sum Sq Mean Sq F value Pr(>F)
## Region 3 49.445 16.4818 15.329 3.148e-09 ***
## Residuals 266 286.003 1.0752
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
summary(mod2)
##
## Call:
## lm(formula = Average.Value ~ Region, data = joined)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.77147 -0.42147 0.00825 0.66354 3.07853
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 7.5865 0.1222 62.081 < 2e-16 ***
## RegionNortheast 0.6606 0.1867 3.539 0.000474 ***
## RegionSouth -0.5449 0.1767 -3.084 0.002258 **
## RegionWest 0.3350 0.1695 1.977 0.049083 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.037 on 266 degrees of freedom
## Multiple R-squared: 0.1474, Adjusted R-squared: 0.1378
## F-statistic: 15.33 on 3 and 266 DF, p-value: 3.148e-09
ggplot(mod2, aes(x = Region, y = Average.Value, col = as.factor(Region)))+
geom_boxplot()
mod3<-lm(Average.Value~Population+Region, data = joined)
anova(mod3)
## Analysis of Variance Table
##
## Response: Average.Value
## Df Sum Sq Mean Sq F value Pr(>F)
## Population 1 6.237 6.2373 6.0209 0.01478 *
## Region 3 54.687 18.2290 17.5966 1.903e-10 ***
## Residuals 265 274.524 1.0359
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
summary(mod3)
##
## Call:
## lm(formula = Average.Value ~ Population + Region, data = joined)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.62548 -0.36183 -0.08396 0.46133 3.03082
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 7.428e+00 1.291e-01 57.536 < 2e-16 ***
## Population 3.536e-08 1.062e-08 3.329 0.000996 ***
## RegionNortheast 6.401e-01 1.833e-01 3.492 0.000562 ***
## RegionSouth -6.298e-01 1.753e-01 -3.593 0.000390 ***
## RegionWest 3.315e-01 1.663e-01 1.993 0.047277 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.018 on 265 degrees of freedom
## Multiple R-squared: 0.1816, Adjusted R-squared: 0.1693
## F-statistic: 14.7 on 4 and 265 DF, p-value: 7.337e-11
ggplot(joined, aes(x = Population, y = Average.Value, col = as.factor(Region)))+
geom_point()+
geom_abline(intercept = mod3$coefficients[1], slope = mod3$coefficients[2], col = "coral", lwd = 1)+
geom_abline(intercept = mod3$coefficients[1]+mod3$coefficients[3], slope = mod3$coefficients[2], col = "darkolivegreen4", lwd = 1)+
geom_abline(intercept = mod3$coefficients[1]+mod3$coefficients[4], slope = mod3$coefficients[2], col = "cyan3", lwd = 1)+
geom_abline(intercept = mod3$coefficients[1]+mod3$coefficients[5], slope = mod3$coefficients[2], col = "purple", lwd = 1)
Northeast yhat = .00000003536x+8.0681
South yhat = .00000003536x+6.7982
West yhat = .00000003536x+7.7595
mod4<-lm(Average.Value~Region*Population, data = joined)
summary(mod4)
##
## Call:
## lm(formula = Average.Value ~ Region * Population, data = joined)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.52852 -0.38298 -0.03411 0.43738 2.99913
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 7.329e+00 2.083e-01 35.176 < 2e-16 ***
## RegionNortheast 1.036e+00 2.852e-01 3.632 0.000338 ***
## RegionSouth -4.028e-01 2.803e-01 -1.437 0.151878
## RegionWest 3.226e-01 2.470e-01 1.306 0.192592
## Population 5.738e-08 3.811e-08 1.506 0.133301
## RegionNortheast:Population -8.060e-08 4.687e-08 -1.720 0.086683 .
## RegionSouth:Population -4.060e-08 4.322e-08 -0.939 0.348392
## RegionWest:Population 1.466e-09 4.086e-08 0.036 0.971397
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.008 on 262 degrees of freedom
## Multiple R-squared: 0.2068, Adjusted R-squared: 0.1856
## F-statistic: 9.757 on 7 and 262 DF, p-value: 8.454e-11
anova(mod4)
## Analysis of Variance Table
##
## Response: Average.Value
## Df Sum Sq Mean Sq F value Pr(>F)
## Region 3 49.445 16.4818 16.2288 1.051e-09 ***
## Population 1 11.479 11.4791 11.3029 0.0008893 ***
## Region:Population 3 8.440 2.8133 2.7701 0.0421161 *
## Residuals 262 266.084 1.0156
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
ggplot(mod4, aes(x = Population, y = Average.Value, col = as.factor(Region)))+
geom_point()+
geom_abline(intercept = mod4$coefficients[1], slope = mod4$coefficients[5], col = "coral", lwd = 1)+
geom_abline(intercept = mod4$coefficients[1]+mod4$coefficients[2], slope = mod4$coefficients[5]+mod4$coefficients[6], col = "darkolivegreen4", lwd = 1)+
geom_abline(intercept = mod4$coefficients[1]+mod4$coefficients[3], slope = mod4$coefficients[5]+mod4$coefficients[7], col = "cyan3", lwd = 1)+
geom_abline(intercept = mod4$coefficients[1]+mod4$coefficients[4], slope = mod4$coefficients[5]+mod4$coefficients[8], col = "purple", lwd = 1)
Estimated Models Midwest (Baseline) yhat = .00000005738x+7.329
Northeast yhat = -0.0000000x+8.365
South yhat = .00000001678x+6.9232
West yhat = .000000058846x+7.6516
We learned that when looking at how population affects all the regions’ average minimum wage, there is a significance, but each of the population slopes in each region show that there is no significance.