library(tidyverse)
library(car)
library(janitor)
library(ggcorrplot)
library(dotwhisker)
# Load the baseball dataset (bball.csv) from GitHub into a tibble.
bball <- read_csv(
  "https://raw.githubusercontent.com/ryanburge/pls2003_sp17/master/bball.csv"
)

What team won the most games in a season in the dataset? What year was that?

# Rank every row by wins (w), highest first, keeping team and year.
bball %>%
  select(team_id, w, year) %>%
  arrange(desc(w))
## # A tibble: 854 x 3
##    team_id     w  year
##      <chr> <int> <int>
##  1     SEA   116  2001
##  2     NYA   114  1998
##  3     NYN   108  1986
##  4     ATL   106  1998
##  5     SLN   105  2004
##  6     ATL   104  1993
##  7     OAK   104  1988
##  8     ATL   103  1999
##  9     NYA   103  2002
## 10     NYA   103  2009
## # ... with 844 more rows

What team scored the most runs in a single season? What year was that?

 # Rank every row by runs scored (r), highest first, keeping team and year.
 bball %>%
   select(team_id, r, year) %>%
   arrange(desc(r))
## # A tibble: 854 x 3
##    team_id     r  year
##      <chr> <int> <int>
##  1     CLE  1009  1999
##  2     SEA   993  1996
##  3     CHA   978  2000
##  4     COL   968  2000
##  5     NYA   968  2007
##  6     NYA   965  1998
##  7     BOS   961  2003
##  8     COL   961  1996
##  9     CLE   952  1996
## 10     CLE   950  2000
## # ... with 844 more rows

What team hit the most home runs + triples (combined) in a single season? What year was that?

# Store the combined home-run + triple total as a new column on bball,
# then rank rows by that total, highest first.
bball <- mutate(bball, hrtriple = hr + triple)
bball %>%
  select(team_id, hrtriple, year) %>%
  arrange(desc(hrtriple))
## # A tibble: 854 x 3
##    team_id hrtriple  year
##      <chr>    <int> <int>
##  1     TEX      289  2005
##  2     BAL      286  1996
##  3     HOU      285  2000
##  4     SEA      285  1997
##  5     COL      279  1997
##  6     BOS      278  2003
##  7     TOR      278  2010
##  8     SFN      275  2001
##  9     TEX      275  2003
## 10     COL      274  2001
## # ... with 844 more rows

Question 2

What team averaged the most wins per season in the dataset?

# Average wins per team across all of its rows, sorted from highest to lowest.
bball %>%
  group_by(team_id) %>%
  summarise(meanw = mean(w)) %>%
  arrange(desc(meanw))
## # A tibble: 34 x 2
##    team_id    meanw
##      <chr>    <dbl>
##  1     LAA 90.09091
##  2     NYA 89.90000
##  3     ATL 86.36667
##  4     BOS 85.76667
##  5     SLN 85.26667
##  6     OAK 84.13333
##  7     SFN 83.76667
##  8     LAN 83.40000
##  9     TOR 81.76667
## 10     NYN 81.40000
## # ... with 24 more rows

Which league (AL or NL) has a higher level of offense since 2000?

# Mean runs scored by American League teams in years after 1999.
bball %>%
  filter(league_id == "AL", year > 1999) %>%
  group_by(league_id) %>%
  summarise(meanr = mean(r))
## # A tibble: 1 x 2
##   league_id    meanr
##       <chr>    <dbl>
## 1        AL 760.8108
# Mean runs scored by National League teams in years after 1999.
bball %>%
  filter(league_id == "NL", year > 1999) %>%
  group_by(league_id) %>%
  summarise(meanr = mean(r))
## # A tibble: 1 x 2
##   league_id    meanr
##       <chr>    <dbl>
## 1        NL 719.7628

Does one league pay its players better than the other league since 2005?

# Mean salary for National League rows from 2005 onward.
# The summary column is called meanr even though it averages salary.
bball %>%
  filter(league_id == "NL", year >= 2005) %>%
  group_by(league_id) %>%
  summarise(meanr = mean(salary))
## # A tibble: 1 x 2
##   league_id   meanr
##       <chr>   <dbl>
## 1        NL 3218701
# Mean salary for American League rows from 2005 onward.
# The summary column is called meanr even though it averages salary.
bball %>%
  filter(league_id == "AL", year >= 2005) %>%
  group_by(league_id) %>%
  summarise(meanr = mean(salary))
## # A tibble: 1 x 2
##   league_id   meanr
##       <chr>   <dbl>
## 1        AL 3578627

Question 3

Is there a relationship between the amount a team pays in salary and how many wins it achieves? Visualize this result with points and a line.

# Correlation between wins and salary over the whole dataset.
summarise(bball, cor(w, salary))
## # A tibble: 1 x 1
##   `cor(w, salary)`
##              <dbl>
## 1         0.269302
 # Scatter plot of salary against wins with a fitted linear-model line.
 ggplot(bball, aes(x = w, y = salary)) +
   geom_point() +
   geom_smooth(method = "lm")

Is there a relationship between the number of wins a team achieves and its attendance for the year? Again, visualize the result.

 # Correlation between wins and attendance over the whole dataset.
 summarise(bball, cor(w, attendance))
## # A tibble: 1 x 1
##   `cor(w, attendance)`
##                  <dbl>
## 1            0.4959296
# Scatter plot of attendance against wins with a fitted linear-model line.
ggplot(bball, aes(x = w, y = attendance)) +
  geom_point() +
  geom_smooth(method = "lm")

Does this relationship (between wins and attendance) change if the team is in the American League vs. the National League? Visualize this.

# Wins/attendance correlation restricted to National League rows.
bball %>%
  filter(league_id == "NL") %>%
  summarise(cor(w, attendance))
## # A tibble: 1 x 1
##   `cor(w, attendance)`
##                  <dbl>
## 1            0.4536521
# Attendance vs. wins for National League rows, with a linear-model line.
bball %>%
  filter(league_id == "NL") %>%
  ggplot(aes(x = w, y = attendance)) +
  geom_point() +
  geom_smooth(method = "lm")

# Wins/attendance correlation restricted to American League rows.
bball %>%
  filter(league_id == "AL") %>%
  summarise(cor(w, attendance))
## # A tibble: 1 x 1
##   `cor(w, attendance)`
##                  <dbl>
## 1            0.5497723
# Attendance vs. wins for American League rows, with a linear-model line.
bball %>%
  filter(league_id == "AL") %>%
  ggplot(aes(x = w, y = attendance)) +
  geom_point() +
  geom_smooth(method = "lm")

# Wins/attendance correlation computed separately for each league.
bball %>%
  group_by(league_id) %>%
  summarise(cor(w, attendance))
## # A tibble: 2 x 2
##   league_id `cor(w, attendance)`
##       <chr>                <dbl>
## 1        AL            0.5497723
## 2        NL            0.4536521
# Attendance vs. wins, coloured by league, so each league gets its own
# points and its own linear-model line on a single plot.
ggplot(bball, aes(x = w, y = attendance, color = as.factor(league_id))) +
  geom_point() +
  geom_smooth(method = "lm")

Question 4

# Load the housing dataset (house.csv) from GitHub into a tibble.
house <- read_csv(
  "https://raw.githubusercontent.com/ryanburge/pls2003_sp17/master/house.csv"
)

Construct a regression model that uses “price” as the dependent variable. One independent variable needs to be the age of home. The other four can be your choice.

# Derive each home's age relative to 2017 from its build year.
house <- mutate(house, ageofhome = 2017 - yr_built)

# Regress price on age of home plus four other predictors and print the fit.
reg1 <- lm(
  price ~ ageofhome + bedrooms + bathrooms + condition + floors,
  data = house
)
summary(reg1)
## 
## Call:
## lm(formula = price ~ ageofhome + bedrooms + bathrooms + condition + 
##     floors, data = house)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1660737  -167690   -33282   108864  5656240 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -440278.7    15596.7 -28.229  < 2e-16 ***
## ageofhome      3761.8       88.7  42.409  < 2e-16 ***
## bedrooms       5598.1     2580.6   2.169   0.0301 *  
## bathrooms    298416.0     3769.5  79.166  < 2e-16 ***
## condition     17444.2     3386.6   5.151 2.61e-07 ***
## floors        65591.0     4623.5  14.186  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 297900 on 21607 degrees of freedom
## Multiple R-squared:  0.3425, Adjusted R-squared:  0.3424 
## F-statistic:  2252 on 5 and 21607 DF,  p-value: < 2.2e-16

Interpret the regression output.

The results show that all the variables used are statistically significant. Holding the other variables constant, each additional year of age of the house increases the price by 3761.8 dollars. An additional bedroom leads to an increase in price of 5598.1 dollars. Also, the addition of a bathroom leads to an increase in the value of the house of 298416.0 dollars. An increase in the condition of the house by one point leads to an increase in price of 17444.2 dollars. Finally, an increase of one in the number of floors leads to an increase in the price of the house of 65591.0 dollars.

Question 5

Make a dot-and-whisker plot of the regression analysis you just conducted.

# Dot-and-whisker plot of the reg1 coefficients, with a grey vertical
# reference line at zero.
dwplot(reg1) +
  geom_vline(xintercept = 0, colour = "grey50", linetype = 2)

Now, scale your variables properly and make that dotwhisker plot.

# Rescale each predictor by a fixed divisor so the coefficients are comparable.
# NOTE(review): the divisors look like each column's maximum value; confirm.
house <- mutate(
  house,
  bd = bedrooms / 33,
  bth = bathrooms / 8,
  age = ageofhome / 117,
  cd = condition / 5,
  fl = floors / 3.5
)

# Refit the model on the rescaled predictors. This reassigns reg1, replacing
# the model fitted on the unscaled variables.
reg1 <- lm(
  price ~ age + bd + bth + cd + fl,
  data = house
)
summary(reg1)
## 
## Call:
## lm(formula = price ~ age + bd + bth + cd + fl, data = house)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1660737  -167690   -33282   108864  5656240 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  -440279      15597 -28.229  < 2e-16 ***
## age           440135      10378  42.409  < 2e-16 ***
## bd            184739      85160   2.169   0.0301 *  
## bth          2387328      30156  79.166  < 2e-16 ***
## cd             87221      16933   5.151 2.61e-07 ***
## fl            229569      16182  14.186  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 297900 on 21607 degrees of freedom
## Multiple R-squared:  0.3425, Adjusted R-squared:  0.3424 
## F-statistic:  2252 on 5 and 21607 DF,  p-value: < 2.2e-16
# Dot-and-whisker plot of the current reg1 coefficients, with a grey vertical
# reference line at zero.
zero_line <- geom_vline(xintercept = 0, colour = "grey50", linetype = 2)
dwplot(reg1) + zero_line