#1) Run a regression analysis to see if there is evidence that the three variables audience, male and income are useful to predict the pagecost.

##Part A: setting the wd, importing the data, and viewing the data 
library(readr)
library(ggplot2)
# Switch the session to the course folder, then read the magazine ad-cost CSV
# from that folder into the tibble `Magz`.
# NOTE(review): setwd() with a user-specific path makes the script
# non-portable; consider a project-relative path instead.
setwd(dir = "~/NYU/classes/4. Statistical Modeling/Week 2")
Magz <- read_csv(file = "MagazineAdCost.csv")
## Rows: 55 Columns: 5
## -- Column specification --------------------------------------------------------
## Delimiter: ","
## chr (1): Magazine
## dbl (4): pagecost, audience, male, income
## 
## i Use `spec()` to retrieve the full column specification for this data.
## i Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Preview the first ten rows of the imported data
head(x = Magz, n = 10)
## # A tibble: 10 x 5
##    Magazine               pagecost audience  male income
##    <chr>                     <dbl>    <dbl> <dbl>  <dbl>
##  1 Audubon                   25315     1645  51.1  38787
##  2 Better Homes & Gardens   198000    34797  22.1  41933
##  3 Business Week            103300     4760  68.1  63667
##  4 Cosmopolitan              94100    15452  17.3  44237
##  5 Elle                      55540     3735  12.5  47211
##  6 Entrepreneur              40355     2476  60.4  47579
##  7 Esquire                   51559     3037  71.3  44715
##  8 Family Circle            147500    24539  13    38759
##  9 First For Women           28059     3856   3.6  43850
## 10 Forbes                    59340     4191  68.8  66606
# Open the spreadsheet-style viewer only in an interactive session, so that
# rendering the script or running it in batch does not try to open a GUI window.
if (interactive()) {
  View(Magz)
}
# List the column names of the imported data
names(Magz)
## [1] "Magazine" "pagecost" "audience" "male"     "income"
## Part B: Take the natural logarithm of ‘Pagecost’.
# Add a new column named 'log.pagecost' holding the natural log of the
# existing column 'pagecost'
Magz[["log.pagecost"]] <- log(Magz[["pagecost"]])
# Preview the first ten rows of the data frame with the new column
head(x = Magz, n = 10)
## # A tibble: 10 x 6
##    Magazine               pagecost audience  male income log.pagecost
##    <chr>                     <dbl>    <dbl> <dbl>  <dbl>        <dbl>
##  1 Audubon                   25315     1645  51.1  38787         10.1
##  2 Better Homes & Gardens   198000    34797  22.1  41933         12.2
##  3 Business Week            103300     4760  68.1  63667         11.5
##  4 Cosmopolitan              94100    15452  17.3  44237         11.5
##  5 Elle                      55540     3735  12.5  47211         10.9
##  6 Entrepreneur              40355     2476  60.4  47579         10.6
##  7 Esquire                   51559     3037  71.3  44715         10.9
##  8 Family Circle            147500    24539  13    38759         11.9
##  9 First For Women           28059     3856   3.6  43850         10.2
## 10 Forbes                    59340     4191  68.8  66606         11.0
## Part 3:Build the multiple regression model.
# Multiple regression of log page cost on audience, male and income, stored in
# the variable named "MagzMultipleRegModel".
# The response is written as a bare column name (not Magz$log.pagecost) so that
# every term of the formula is looked up in `data`; this keeps the response and
# the predictors consistent when `subset` is used or the model is refitted on
# other data, and gives clean labels in the printed output.
MagzMultipleRegModel <- lm(log.pagecost ~ audience + male + income, data = Magz)
# getting the summary
summary(MagzMultipleRegModel)
## 
## Call:
## lm(formula = log.pagecost ~ audience + male + income, data = Magz)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.72776 -0.27050  0.05691  0.26717  0.50175 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  9.895e+00  2.717e-01  36.425  < 2e-16 ***
## audience     4.417e-05  4.520e-06   9.771 2.82e-13 ***
## male        -3.201e-03  2.218e-03  -1.443  0.15509    
## income       1.953e-05  5.947e-06   3.284  0.00185 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.3472 on 51 degrees of freedom
## Multiple R-squared:  0.6533, Adjusted R-squared:  0.6329 
## F-statistic: 32.03 on 3 and 51 DF,  p-value: 8.795e-12
# Here’s how you can obtain the anova output if needed:
# (the table is sequential: each term is tested after the terms listed above
# it, which is why the p-value for male differs from the one in summary())
anova(MagzMultipleRegModel)
## Analysis of Variance Table
## 
## Response: log.pagecost
##           Df  Sum Sq Mean Sq F value    Pr(>F)    
## audience   1 10.2560 10.2560 85.0820 1.881e-12 ***
## male       1  0.0260  0.0260  0.2158  0.644198    
## income     1  1.2998  1.2998 10.7829  0.001855 ** 
## Residuals 51  6.1477  0.1205                      
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

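##Fitted Plane: page cost = 4042.7986 + 3.7880 audience - 123.6343 male + 0.9026 income (NEED TO EVALUATE THIS: these are presumably the coefficients of a fit on the untransformed pagecost, in the order audience, male, income; they are not the coefficients of the log model above)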

##Log Fitted Plane: LogPageCost = 9.895 + 4.417e-05 audience - 3.201e-03 male + 1.953e-05 income

#2) If that is indeed the case, which of the three variables are useful in the model, in the presence of the other variables? Are there any magazines in the dataset that have unusually high or low pricing for full page colour ads relative to their audience/male/income profile?

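##The p-values for audience (2.82e-13) and income (0.00185) are below 5%, so both ‘audience’ and ‘income’ are useful in the presence of the other variables; the p-value for male (0.155) is above 5%, so ‘male’ is not. To understand the outliers, we will look at the following: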

##Part 4: Plot the standardized residuals vs. fitted values
# Standardized residuals of the fitted model
MagzMultipleRegModel.StdRes <- rstandard(MagzMultipleRegModel)
# Fitted values of the model
MagzMultipleRegModel.Fit <- fitted.values(MagzMultipleRegModel)
# Fitted values on the x axis, standardized residuals on the y axis,
# with the points drawn in blue
p3 <- ggplot(
  data = Magz,
  aes(x = MagzMultipleRegModel.Fit, y = MagzMultipleRegModel.StdRes)
) +
  geom_point(color = "blue")
# Overlay a straight least-squares line through the points in red, without a
# confidence band; labs() is used to label the axes.
# `FALSE` is spelled out because `F` is an ordinary variable that can be
# reassigned.
p3 +
  geom_smooth(method = "lm", se = FALSE, col = "red") +
  labs(x = "Fitted values", y = "Standard Residuals")
## `geom_smooth()` using formula 'y ~ x'

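## Based on this plot, there are no outliers, i.e. no standardized residuals fall outside (-3, 3). The most extreme residual (-0.728, about -2.1 residual standard errors) is the only one beyond -2 and is the magazine most worth a closer look for unusually low pricing.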