setwd(“C:/Users/yvette federspiel/Documents/Dodgers.csv”)

library(readr)


read_csv("https://raw.githubusercontent.com/utjimmyx/regression/master/DodgersData.csv")
## Parsed with column specification:
## cols(
##   month = col_character(),
##   day = col_double(),
##   attend = col_double(),
##   day_of_week = col_character(),
##   opponent = col_character(),
##   temp = col_double(),
##   skies = col_character(),
##   day_night = col_character(),
##   cap = col_character(),
##   shirt = col_character(),
##   fireworks = col_character(),
##   bobblehead = col_character()
## )
## # A tibble: 81 x 12
##    month   day attend day_of_week opponent  temp skies day_night cap   shirt
##    <chr> <dbl>  <dbl> <chr>       <chr>    <dbl> <chr> <chr>     <chr> <chr>
##  1 APR      10  56000 Tuesday     Pirates     67 Clear Day       NO    NO   
##  2 APR      11  29729 Wednesday   Pirates     58 Clou~ Night     NO    NO   
##  3 APR      12  28328 Thursday    Pirates     57 Clou~ Night     NO    NO   
##  4 APR      13  31601 Friday      Padres      54 Clou~ Night     NO    NO   
##  5 APR      14  46549 Saturday    Padres      57 Clou~ Night     NO    NO   
##  6 APR      15  38359 Sunday      Padres      65 Clear Day       NO    NO   
##  7 APR      23  26376 Monday      Braves      60 Clou~ Night     NO    NO   
##  8 APR      24  44014 Tuesday     Braves      63 Clou~ Night     NO    NO   
##  9 APR      25  26345 Wednesday   Braves      64 Clou~ Night     NO    NO   
## 10 APR      27  44807 Friday      Nationa~    66 Clear Night     NO    NO   
## # ... with 71 more rows, and 2 more variables: fireworks <chr>,
## #   bobblehead <chr>
#library(car) # Package with Special functions for linear regression
library(readr)
library(lattice) # Graphics Package
library(ggplot2) # Graphical Package

#Create a dataframe with the Dodgers Data
#DodgersData <- read.csv("DodgersData.csv")
DodgersData <- read.csv("DodgersData.csv", fileEncoding="UTF-8-BOM", header=TRUE)

Data cleanup and exploratory analysis

rm(list=ls())# clear memory Evaluate the Structure and Re-Level the factor variables for “Day Of Week”" and “Month”" in the right order

# Check the structure for Dorder Data
str(DodgersData)
## 'data.frame':    81 obs. of  12 variables:
##  $ month      : chr  "APR" "APR" "APR" "APR" ...
##  $ day        : int  10 11 12 13 14 15 23 24 25 27 ...
##  $ attend     : int  56000 29729 28328 31601 46549 38359 26376 44014 26345 44807 ...
##  $ day_of_week: chr  "Tuesday" "Wednesday" "Thursday" "Friday" ...
##  $ opponent   : chr  "Pirates" "Pirates" "Pirates" "Padres" ...
##  $ temp       : int  67 58 57 54 57 65 60 63 64 66 ...
##  $ skies      : chr  "Clear " "Cloudy" "Cloudy" "Cloudy" ...
##  $ day_night  : chr  "Day" "Night" "Night" "Night" ...
##  $ cap        : chr  "NO" "NO" "NO" "NO" ...
##  $ shirt      : chr  "NO" "NO" "NO" "NO" ...
##  $ fireworks  : chr  "NO" "NO" "NO" "YES" ...
##  $ bobblehead : chr  "NO" "NO" "NO" "NO" ...
head(DodgersData)
##   month day attend day_of_week opponent temp  skies day_night cap shirt
## 1   APR  10  56000     Tuesday  Pirates   67 Clear        Day  NO    NO
## 2   APR  11  29729   Wednesday  Pirates   58 Cloudy     Night  NO    NO
## 3   APR  12  28328    Thursday  Pirates   57 Cloudy     Night  NO    NO
## 4   APR  13  31601      Friday   Padres   54 Cloudy     Night  NO    NO
## 5   APR  14  46549    Saturday   Padres   57 Cloudy     Night  NO    NO
## 6   APR  15  38359      Sunday   Padres   65 Clear        Day  NO    NO
##   fireworks bobblehead
## 1        NO         NO
## 2        NO         NO
## 3        NO         NO
## 4       YES         NO
## 5        NO         NO
## 6        NO         NO
# Evaluate the factor levels for day_of_week
# levels(DodgersData$day_of_week)


 

# Evaluate the factor levels for month
levels(DodgersData$month)
## NULL
# First 10 rows of the data frame
head(DodgersData, 10)
##    month day attend day_of_week  opponent temp  skies day_night cap shirt
## 1    APR  10  56000     Tuesday   Pirates   67 Clear        Day  NO    NO
## 2    APR  11  29729   Wednesday   Pirates   58 Cloudy     Night  NO    NO
## 3    APR  12  28328    Thursday   Pirates   57 Cloudy     Night  NO    NO
## 4    APR  13  31601      Friday    Padres   54 Cloudy     Night  NO    NO
## 5    APR  14  46549    Saturday    Padres   57 Cloudy     Night  NO    NO
## 6    APR  15  38359      Sunday    Padres   65 Clear        Day  NO    NO
## 7    APR  23  26376      Monday    Braves   60 Cloudy     Night  NO    NO
## 8    APR  24  44014     Tuesday    Braves   63 Cloudy     Night  NO    NO
## 9    APR  25  26345   Wednesday    Braves   64 Cloudy     Night  NO    NO
## 10   APR  27  44807      Friday Nationals   66 Clear      Night  NO    NO
##    fireworks bobblehead
## 1         NO         NO
## 2         NO         NO
## 3         NO         NO
## 4        YES         NO
## 5         NO         NO
## 6         NO         NO
## 7         NO         NO
## 8         NO         NO
## 9         NO         NO
## 10       YES         NO

Let R identify the temperature, the attendance, the opponent, and the promotion (i.e., bobblehead) for the 20th home game of the season.

DodgersData[20, c("temp", "attend", "opponent", "bobblehead")]
##    temp attend opponent bobblehead
## 20   70  47077   Snakes        YES

Let R identify the average value for attendance.

meanattend <- mean(DodgersData$attend)
meanattend
## [1] 41040.07

Let R identify the number of promotions they have had.

promotions <- sum(DodgersData$bobblehead=="YES")
promotions
## [1] 11
## in-class notes

Note: You may perform the regression analysis using Excel.

If you chose to use R and RStudio, please work on any two of the first three questions (1a, 1b, and 1c) and the last two questions (2 and 3).

If you chose to use Excel, please post your spreadsheet solutions and the answers to Questions 1a, 1b, 1c, 2, and 3.

Q 1a: Let R identify the temperature, the attendance, the opponent, and the promotion (i.e., bobblehead) for the 25th home game of the season. Report your results.

Answer:

Q 1b: What is the median value of attendance? Please review your in-class notes and write your function and answer below.

Answer:

Q 1c: How many night games did the Dodgers have? Please review your in-class notes and write your function and answer below.

Answer:

Q 2: Interpret one of the box plots or scatter plots in plain language.

Answer:

Q 3: Explain the final statistical results in plain language.

Answer: #### Q 4: Please read the tutorial “Advanced topics - Formatting a testable marketing hypothesis.docx” and develop two “draft” hypotheses for your group project.

Answer:

Exploratory analysis

The results show that in 2012 there were a few promotions (see the last four columns)

Cap Shirt Fireworks Bobblehead

We have data from April to October for games played in the Day or Night under Clear or Cloudy Skys.

Dodger Stadium has a capacity of about 56,000. Looking at the entire (sample) data shows that the stadium filled up only twice in 2012. There were only two cap promotions, three shirt promotions - not enough data for any inferences. Fireworks and Bobblehead promotions have happened a few times.

Further more there were eleven bobble head promotions and most of then (six) being on Tuesday nights.

Attendance by Day Of Week/month

Evaluate Attendance by Weather

#Evaluate attendance by weather
ggplot(DodgersData, aes(x=temp, y=attend/1000, color=fireworks)) +
geom_point() +
facet_wrap(day_night~skies) +
ggtitle("Dodgers Attendance By Temperature By Time of Game and Skies") +
theme(plot.title = element_text(lineheight=3, face="bold", color="black", size=10)) +
xlab("Temperature (Degree Farenheit)") +
ylab("Attendance (Thousands)")

Strip Plot of Attendance by opponent or visiting team

#Strip Plot of Attendance by opponent or visiting team
ggplot(DodgersData, aes(x=attend/1000, y=opponent, color=day_night)) +
geom_point() +
ggtitle("Dodgers Attendance By Opponent") +
theme(plot.title = element_text(lineheight=3, face="bold", color="black", size=10)) +
xlab("Attendance (Thousands)") +
ylab("Opponent (Visiting Team)")

##Design Predictive Model

To advise the management if promotions impact attendance we will need to identify if there is a positive effect, and if there is a positive effect how much of an effect it is.

To provide this advice, I built a Linear Model for predicting attendance using Month, Day Of Week and the indicator variable Bobblehead promotion. I split the data into Training and Test to create the model

# Create a model with the bobblehead variable entered last
my.model <- {attend ~ month + day_of_week + bobblehead}

# use the full data set to obtain an estimate of the increase in
# attendance due to bobbleheads, controlling for other factors
my.model.fit <- lm(my.model, data = DodgersData) # use all available data
print(summary(my.model.fit))
## 
## Call:
## lm(formula = my.model, data = DodgersData)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -10786.5  -3628.1   -516.1   2230.2  14351.0 
## 
## Coefficients:
##                      Estimate Std. Error t value Pr(>|t|)    
## (Intercept)          38792.98    2364.68  16.405  < 2e-16 ***
## monthAUG              2377.92    2402.91   0.990   0.3259    
## monthJUL              2849.83    2578.60   1.105   0.2730    
## monthJUN              7163.23    2732.72   2.621   0.0108 *  
## monthMAY             -2385.62    2291.22  -1.041   0.3015    
## monthOCT              -662.67    4046.45  -0.164   0.8704    
## monthSEP                29.03    2521.25   0.012   0.9908    
## day_of_weekMonday    -4883.82    2504.65  -1.950   0.0554 .  
## day_of_weekSaturday   1488.24    2442.68   0.609   0.5444    
## day_of_weekSunday     1840.18    2426.79   0.758   0.4509    
## day_of_weekThursday  -4108.45    3381.22  -1.215   0.2286    
## day_of_weekTuesday    3027.68    2686.43   1.127   0.2638    
## day_of_weekWednesday -2423.80    2485.46  -0.975   0.3330    
## bobbleheadYES        10714.90    2419.52   4.429 3.59e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 6120 on 67 degrees of freedom
## Multiple R-squared:  0.5444, Adjusted R-squared:  0.456 
## F-statistic: 6.158 on 13 and 67 DF,  p-value: 2.083e-07