#1) Run a regression analysis to see if there is evidence that the three variables audience, male and income are useful to predict the pagecost.
##Part A: setting the wd, importing the data, and viewing the data
library(readr)
library(ggplot2)
# Work from the Week 2 course folder so the CSV can be referenced by file name
setwd(dir = "~/NYU/classes/4. Statistical Modeling/Week 2")
# Load the magazine advertising data (read_csv returns a tibble)
Magz <- read_csv(file = "MagazineAdCost.csv")
## Rows: 55 Columns: 5
## -- Column specification --------------------------------------------------------
## Delimiter: ","
## chr (1): Magazine
## dbl (4): pagecost, audience, male, income
##
## i Use `spec()` to retrieve the full column specification for this data.
## i Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Preview the first ten rows of the imported data
head(Magz, n = 10)
## # A tibble: 10 x 5
## Magazine pagecost audience male income
## <chr> <dbl> <dbl> <dbl> <dbl>
## 1 Audubon 25315 1645 51.1 38787
## 2 Better Homes & Gardens 198000 34797 22.1 41933
## 3 Business Week 103300 4760 68.1 63667
## 4 Cosmopolitan 94100 15452 17.3 44237
## 5 Elle 55540 3735 12.5 47211
## 6 Entrepreneur 40355 2476 60.4 47579
## 7 Esquire 51559 3037 71.3 44715
## 8 Family Circle 147500 24539 13 38759
## 9 First For Women 28059 3856 3.6 43850
## 10 Forbes 59340 4191 68.8 66606
# View() opens a spreadsheet-style GUI viewer; only call it in an interactive
# session so that batch runs (Rscript, knitting) are not interrupted by it.
if (interactive()) {
  View(Magz)
}
# List the column names of the data
names(Magz)
## [1] "Magazine" "pagecost" "audience" "male" "income"
## Part B: Take the natural logarithm of ‘Pagecost’.
# Add a new column 'log.pagecost' holding the natural log of the existing
# column 'pagecost'
Magz[["log.pagecost"]] <- log(Magz[["pagecost"]])
# Preview the first ten rows to check that the new column is present
head(Magz, n = 10)
## # A tibble: 10 x 6
## Magazine pagecost audience male income log.pagecost
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 Audubon 25315 1645 51.1 38787 10.1
## 2 Better Homes & Gardens 198000 34797 22.1 41933 12.2
## 3 Business Week 103300 4760 68.1 63667 11.5
## 4 Cosmopolitan 94100 15452 17.3 44237 11.5
## 5 Elle 55540 3735 12.5 47211 10.9
## 6 Entrepreneur 40355 2476 60.4 47579 10.6
## 7 Esquire 51559 3037 71.3 44715 10.9
## 8 Family Circle 147500 24539 13 38759 11.9
## 9 First For Women 28059 3856 3.6 43850 10.2
## 10 Forbes 59340 4191 68.8 66606 11.0
## Part 3:Build the multiple regression model.
# Multiple regression model stored in the variable named "MagzMultipleRegModel".
# The response is written as a bare column name (not Magz$log.pagecost) so that
# every term of the formula is looked up through `data = Magz`. With the `$`
# form the response bypasses `data`, so refitting on another data frame (e.g.
# update(model, data = subset)) would still pull the response from the global
# Magz, and the printed output carries the clumsy "Magz$log.pagecost" label.
MagzMultipleRegModel <- lm(log.pagecost ~ audience + male + income, data = Magz)
# Coefficient estimates, t-tests, R-squared and the overall F-test
summary(MagzMultipleRegModel)
##
## Call:
## lm(formula = log.pagecost ~ audience + male + income, data = Magz)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.72776 -0.27050 0.05691 0.26717 0.50175
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 9.895e+00 2.717e-01 36.425 < 2e-16 ***
## audience 4.417e-05 4.520e-06 9.771 2.82e-13 ***
## male -3.201e-03 2.218e-03 -1.443 0.15509
## income 1.953e-05 5.947e-06 3.284 0.00185 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.3472 on 51 degrees of freedom
## Multiple R-squared: 0.6533, Adjusted R-squared: 0.6329
## F-statistic: 32.03 on 3 and 51 DF, p-value: 8.795e-12
# Sequential (Type I) analysis-of-variance table for the same model, if needed:
anova(MagzMultipleRegModel)
## Analysis of Variance Table
##
## Response: log.pagecost
## Df Sum Sq Mean Sq F value Pr(>F)
## audience 1 10.2560 10.2560 85.0820 1.881e-12 ***
## male 1 0.0260 0.0260 0.2158 0.644198
## income 1 1.2998 1.2998 10.7829 0.001855 **
## Residuals 51 6.1477 0.1205
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##Fitted Plane (untransformed response): pagecost = 4042.7986 + 3.7880 audience - 123.6343 male + 0.9026 income (NEED TO EVALUATE THIS)
##Log Fitted Plane: log(pagecost) = 9.895 + 4.417e-05 audience - 3.201e-03 male + 1.953e-05 income
#2) If that is indeed the case, which of the three variables are useful in the model, in the presence of the other variables? Are there any magazines in the dataset that have unusually high or low pricing for full page colour ads relative to their audience/male/income profile?
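##The p-values for audience (2.82e-13) and income (0.00185) are both below 5%, so 'audience' and 'income' are useful in the presence of the other variables; 'male' (p = 0.155) is not. To understand the outliers, we will look at the following: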
##Part 4: Plot the standardized residuals vs. fitted values
# Standardized residuals of the fitted model, used to look for magazines whose
# page cost is unusually high or low relative to the model's prediction
MagzMultipleRegModel.StdRes <- rstandard(MagzMultipleRegModel)
# Fitted values of the model, one per observation
MagzMultipleRegModel.Fit <- fitted.values(MagzMultipleRegModel)
# Scatter plot: fitted values on the x axis, standardized residuals on the
# y axis, points drawn in blue
p3 <- ggplot(
  data = Magz,
  aes(x = MagzMultipleRegModel.Fit, y = MagzMultipleRegModel.StdRes)
) +
  geom_point(color = "blue")
# Overlay a least-squares line in red without its confidence band
# (se = FALSE, spelled out because the shorthand F can be reassigned),
# and use labs() to label the axes
p3 +
  geom_smooth(method = "lm", se = FALSE, colour = "red") +
  labs(x = "Fitted values", y = "Standard Residuals")
## `geom_smooth()` using formula 'y ~ x'