library("stringr")
library("DT")
library("data.table")
library("ggplot2")
library("corrplot")
library("plotly")
library("knitr")
library("ggplot2")
library("corrplot")
library("rvest")
library("dplyr")
library("tidyr")

Overview

In this assignment we need to take 3 untidy data sets that our classmates provided to us and perform the requested analysis.

Massey Ratings

Leland Randles asked whether offense or defense matters more when it comes to Massey Ratings for college football teams. He provided the dataset at the following location: http://www.masseyratings.com/cf/years.txt

Acquire Data
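
The chunk below reads a local copy of the ratings file. If Massey.txt is not already on disk, it can be fetched first (a minimal sketch, assuming the URL above is still live):

download.file("http://www.masseyratings.com/cf/years.txt", "Massey.txt")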

file <- readLines("Massey.txt")

team_v    <- vector()
year_v    <- vector()
offense_v <- vector()
defense_v <- vector()
rating_v  <- vector()

# Skip the header rows at the top of the file
file <- file[9:length(file)]

# Loop through the file, one year block at a time
outer_counter <- 0
while (outer_counter < 69) { # File has 69 years in it
  # Keep the year as character for now; concatenating factors with c()
  # degrades them to integer codes
  year <- trimws(file[[1]])

  # Grab the top 10 teams for the year and loop through them
  chunk <- file[7:16]
  for (inner_counter in 1:10) {
    team_v    <- c(team_v,    trimws(substr(chunk[[inner_counter]], 4, 20)))
    offense_v <- c(offense_v, as.numeric(trimws(substr(chunk[[inner_counter]], 47, 49))))
    defense_v <- c(defense_v, as.numeric(trimws(substr(chunk[[inner_counter]], 53, 55))))
    rating_v  <- c(rating_v,  as.numeric(trimws(substr(chunk[[inner_counter]], 65, 70))))
    year_v    <- c(year_v, year)
  } # end for

  outer_counter <- outer_counter + 1  # plain <-; <<- is only needed inside functions
  file <- file[18:length(file)]
} # end while

massey_ratings <- data.frame(year    = as.factor(year_v),
                             team    = team_v,
                             offense = offense_v,
                             defense = defense_v,
                             rating  = rating_v)

datatable(massey_ratings)
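
A quick sanity check (not in the original analysis) that the loop captured everything it should have: 69 years times the top 10 teams, or 690 rows.

stopifnot(nrow(massey_ratings) == 69 * 10)  # expect 690 rows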

Check Correlations

corrM <- cor(massey_ratings[,c("offense","defense", "rating")])
kable(corrM)
          offense     defense     rating
offense   1.0000000  -0.3506145  -0.3128360
defense  -0.3506145   1.0000000  -0.2317220
rating   -0.3128360  -0.2317220   1.0000000
corrplot(corrM, type="upper", method = "ellipse")
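
To check whether these weak correlations are distinguishable from zero, base R's cor.test reports a p-value for each pairwise relationship (a minimal sketch against the same massey_ratings frame):

cor.test(massey_ratings$offense, massey_ratings$rating)  # offense vs rating
cor.test(massey_ratings$defense, massey_ratings$rating)  # defense vs rating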

ggplot(massey_ratings, aes(x = defense, y = rating)) + geom_point() + ggtitle("Defense vs Ratings") + theme(text = element_text(size = 6))

ggplot(massey_ratings, aes(x = offense, y = rating)) + geom_point() + ggtitle("Offense vs Ratings") + theme(text = element_text(size = 6))

Conclusions

Neither offense nor defense shows more than a weak correlation with rating (-0.31 and -0.23, respectively), so I cannot say that either of them matters more.

Walking Dead Viewership

Walt Wells provided a link to the Wikipedia page for Walking Dead Viewership by Episode: https://en.wikipedia.org/wiki/Template:The_Walking_Dead_ratings

Most TV shows get cancelled when viewership starts to fall. Therefore I am asking: did viewership for this show fall over its last two seasons?

Acquire Data

#Read table from wikipedia

#Begin code based on https://codedump.io/share/Unw54jjrt1cG/1/web-scraping-using-rvest-on-a-tennis-table-from-wiki

walking_dead_url <- "https://en.wikipedia.org/wiki/Template:The_Walking_Dead_ratings"
html_tables <- walking_dead_url %>%
  read_html() %>%
  html_nodes(".wikitable") %>%
  html_table(fill = TRUE, header = TRUE)

#End code based on https://codedump.io/share/Unw54jjrt1cG/1/web-scraping-using-rvest-on-a-tennis-table-from-wiki

data <- html_tables[[2]]
kable(head(data))
Season Episode number Episode number Episode number Episode number Episode number Episode number Episode number Episode number Episode number Episode number Episode number Episode number Episode number Episode number Episode number Episode number
Season 1.000 2.000 3.000 4.000 5.00 6.000 7 8 9 10 11 12 13 14 15 16
1 5.350 4.710 5.070 4.750 5.56 5.970
2 7.260 6.700 6.100 6.290 6.12 6.080 6.620 8.100 6.890 7.040 6.770 6.890 8.990
3 10.970 9.550 10.510 9.270 10.37 9.210 10.430 10.480 12.260 11.050 11.010 11.300 11.460 10.840 10.990 12.420
4 16.111 13.950 12.920 13.310 12.20 12.000 11.290 12.050 15.760 13.340 13.120 12.610 12.650 12.870 13.470 15.680
5 17.300 15.143 13.801 14.518 13.53 14.068 13.330 14.807 15.643 12.267 13.438 14.430 14.534 13.781 13.757 15.784
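
The hard-coded index [[2]] used above assumes the ratings table is the second .wikitable on the page. If the page layout ever shifts, the scraped list can be inspected first (a minimal sketch):

length(html_tables)       # how many wikitable nodes were parsed
lapply(html_tables, dim)  # dimensions of each, to spot the ratings table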

Tidy Data

#Rename the columns to "season" plus episode numbers 1-16
colnames(data) <- c("season", 1:(ncol(data) - 1))

#Drop the duplicated header row, then rotate the data to make it "long"
data <- data[-1, ] %>%
  gather("episode", "viewers", 2:ncol(data)) %>%
  filter(viewers != "—")
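
gather() still works but has been superseded in tidyr; the same reshape can be written with pivot_longer (an equivalent sketch, assuming tidyr >= 1.0):

data <- data[-1, ] %>%
  pivot_longer(-season, names_to = "episode", values_to = "viewers") %>%
  filter(viewers != "—")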

kable(head(data))
season episode viewers
1 1 5.35
2 1 7.26
3 1 10.97
4 1 16.111
5 1 17.3
6 1 14.633
#Cast data to proper types
data$season <- as.factor(data$season)
data$episode <- as.numeric(data$episode)
data$viewers <- as.numeric(data$viewers)

#Add the change in viewership from episode to episode and a
#column that has season and episode concatenated to better display the graph
data <- data %>%
  arrange( season, episode) %>%
  mutate( delta_viewers = viewers- lag(viewers) ) %>%
  mutate( season_episode = paste(season, "-", episode, sep="") )
 
data[1,"delta_viewers"] <- data[1,"viewers"]

kable(head(data))
season episode viewers delta_viewers season_episode
1 1 5.35 5.35 1-1
1 2 4.71 -0.64 1-2
1 3 5.07 0.36 1-3
1 4 4.75 -0.32 1-4
1 5 5.56 0.81 1-5
1 6 5.97 0.41 1-6

Analyze Data

#Plot the data as it appears on Wikipedia
ggplot(data, aes(x = season_episode, y = viewers, fill = season)) +
  geom_bar(stat = "identity") +
  theme(text = element_text(size = 6), axis.text.x = element_text(angle = 90)) +
  ggtitle("Walking Dead viewership per episode in millions")

#Get just the last 2 seasons
last_two_seasons <- filter(data, season %in% c(5, 6))

#Average the change in viewership per episode over the last two seasons
#(length() on a data frame counts columns, so take mean() over the rows instead)
average_delta_per_episode <- mean(last_two_seasons$delta_viewers)

The average viewership changed by -0.3 million per episode over the last two seasons, so I can confirm my hypothesis that viewership declined, which (per the premise above) puts a show at risk of cancellation. Also note that in the graph the last two episodes had an uptick in viewership, so if you threw out those two episodes the average decline per episode would be greater. It is common for a season finale to see an uptick in viewership.
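
To quantify that, a quick sketch (a hypothetical follow-up, not part of the original analysis) that recomputes the average after dropping the last two episodes of season 6:

no_finale <- last_two_seasons %>%
  filter(!(season == 6 & episode >= 15))  # drop the final two episodes
mean(no_finale$delta_viewers)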

Pittsburgh Air Quality

I (Raphael Nash) submitted a data set on the air quality in Pittsburgh, PA. In Pittsburgh during the summer we have days that are “air quality action days”. During these days anyone with respiratory problems should stay indoors. The worst pollutant in Pittsburgh is called PM2.5, otherwise known as fine particulates, mostly from coal-fired industry. I would like to check whether PM2.5 is higher in the summer than in the winter. I found a data set at: https://data.wprdc.org/dataset/allegheny-county-air-quality/resource/967f1285-f8fb-4785-9673-64a8ae47588d . This dataset only includes data from the dates 20160115 - 20161001.

Acquire Data

pgh_air_quality <- read.csv("pgh_air_quality.csv", stringsAsFactors = FALSE)
pgh_air_quality$date <- as.Date(pgh_air_quality$datetime)
pgh_air_quality$site <- as.factor(pgh_air_quality$site)
kable(head(pgh_air_quality))
(Wide table omitted for readability: one row per site / datetime / stat combination, with columns X_id, site, datetime, stat, a value and _txt column pair for each measurement (BCSTAT, BP, CO, H2S, INT_T, NO, NO2, NOX, NOY, NOYDIF, OUT_RH, OUT_T, OZONE, OZONE2, PER_F, PER_F2, PM10, PM10_FL, PM10B, PM25, PM25.2., PM25_FL, PM25B, PM25T, RWD, RWS, SIGTHETA, SO2, SONICWD, SONICWS, UVPM), and the derived date column. Most cells are NA because each site reports only a subset of the measurements.)

Prepare Data for PM2.5

#Subset only data related to PM2.5 Particles
pgh_pm25 <- pgh_air_quality[, c("site", "date", "stat", "PM25")]
pgh_pm25 <- drop_na(pgh_pm25,  PM25)

kable(head(pgh_pm25))
site date stat PM25
274 Lincoln 2016-02-21 Avg 30
373 Liberty 2 2016-01-15 Count 24
375 Liberty 2 2016-01-15 Min 2
376 Liberty 2 2016-01-15 Max 43
377 Liberty 2 2016-01-15 Hr. of Max 0
378 Liberty 2 2016-01-15 Avg 16
#Rotate data so that it is one observation per row
#an observation is a measurement at a site on a day
pgh_pm25 <- spread(pgh_pm25, stat, PM25) 
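
spread() has likewise been superseded in tidyr; the rotation above is equivalent to pivot_wider (a sketch, assuming tidyr >= 1.0):

pgh_pm25 <- pivot_wider(pgh_pm25, names_from = stat, values_from = PM25)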

#Add Month as Month Number
pgh_pm25 <- mutate(pgh_pm25, month= as.numeric(format(date,"%m"))) 

kable(head(pgh_pm25))
site date Avg Count Hr. of Max Max Min month
Liberty 2 2016-01-15 16 24 0 43 2 1
Liberty 2 2016-01-16 4 24 9 12 2 1
Liberty 2 2016-01-17 4 24 19 8 2 1
Liberty 2 2016-01-18 4 24 18 6 3 1
Liberty 2 2016-01-19 5 24 22 13 3 1
Liberty 2 2016-01-20 9 24 5 15 4 1
#Average all daily max readings for a month
pgh_pm25_avg_max <-  pgh_pm25  %>%
  drop_na(Max) %>% 
  group_by(site,month) %>% 
  summarise(avg_max= mean(Max))

kable(head(pgh_pm25_avg_max))
site month avg_max
Liberty 2 1 22.10000
Liberty 2 2 19.82759
Liberty 2 3 26.87097
Liberty 2 4 29.06667
Liberty 2 5 32.48387
Liberty 2 6 30.70000

Analyze Data for PM2.5

ggplot(pgh_pm25_avg_max, aes(x = as.factor(month), y = avg_max, group = site, colour = site)) + geom_line() + xlab("Month") + ylab("Avg Max PM2.5") + ggtitle("Avg Daily Max PM2.5 by Month, 2016-01-15 to 2016-10-01")
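
Since the question is specifically whether PM2.5 is higher in the summer than in the winter, a direct comparison helps. The sketch below is a hypothetical follow-up: it labels June through August as summer and January/February as winter (the data stop at 2016-10-01, so no December readings exist) and averages the daily maxima per site:

seasonal_pm25 <- pgh_pm25 %>%
  drop_na(Max) %>%
  mutate(period = case_when(month %in% 6:8 ~ "summer",
                            month %in% c(1, 2) ~ "winter",
                            TRUE ~ "other")) %>%
  filter(period != "other") %>%
  group_by(site, period) %>%
  summarise(avg_max = mean(Max))

kable(seasonal_pm25)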

I found it interesting that “Liberty 2” shows a clear seasonal pattern, but at the Lincoln site PM2.5 particles keep increasing. So I found more documentation at http://www.achd.net/air/publiccomment2014/netrev2014draft.pdf on the Allegheny County Health Dept (ACHD) air quality monitoring sites. This document notes that the Lincoln and Liberty 2 sites are downwind of USS Clairton Works, a coke processing plant. Coke is made from coal and is the raw fuel for steel blast furnaces. It would be interesting to see if there is an issue at USS Clairton that is causing the uptrend in PM2.5.

Per the daily summaries that the ACHD puts out (http://www.achd.net/airqual/DailySummary.PDF), it looks like PM2.5B is the better measure. It is measured at two sites: one in Lawrenceville, a neighborhood in Pittsburgh that is undergoing a revitalization, and one in Avalon, a largely residential borough six miles down the Ohio River from downtown Pittsburgh.

Prepare Data for PM2.5B

#Subset only data related to PM2.5B Particles
pgh_pm25B <- pgh_air_quality[, c("site", "date", "stat", "PM25B")]
pgh_pm25B <- drop_na(pgh_pm25B,  PM25B)

kable(head(pgh_pm25B))
site date stat PM25B
1 Avalon 2016-01-15 Count 24
4 Avalon 2016-01-15 Min 2
5 Avalon 2016-01-15 Max 19
7 Avalon 2016-01-15 Hr. of Max 17
8 Avalon 2016-01-15 Avg 11
9 Avalon 2016-01-16 Count 24
#Rotate data so that it is one observation per row
#an observation is a measurement at a site on a day
pgh_pm25B <- spread(pgh_pm25B, stat, PM25B) 

#Add Month as Month Number
pgh_pm25B <- mutate(pgh_pm25B, month= as.numeric(format(date,"%m"))) 

kable(head(pgh_pm25B))
site date Avg Count Hr. of Max Max Min month
Avalon 2016-01-15 11 24 17 19 2 1
Avalon 2016-01-16 7 24 8 22 1 1
Avalon 2016-01-17 5 24 22 12 3 1
Avalon 2016-01-18 7 24 23 11 4 1
Avalon 2016-01-19 10 24 13 22 5 1
Avalon 2016-01-20 16 24 8 22 10 1
#Average all daily max readings for a month
pgh_pm25B_avg_max <-  pgh_pm25B  %>%
  drop_na(Max) %>% 
  group_by(site,month) %>% 
  summarise(avg_max= mean(Max))

kable(head(pgh_pm25B_avg_max))
site month avg_max
Avalon 1 18.90000
Avalon 2 17.82759
Avalon 3 20.19355
Avalon 4 19.33333
Avalon 5 19.54839
Avalon 6 23.26667

Analyze Data for PM2.5B

ggplot(pgh_pm25B_avg_max, aes(x = as.factor(month), y = avg_max, group = site, colour = site)) + geom_line() + xlab("Month") + ylab("Avg Max PM2.5B") + ggtitle("Avg Daily Max PM2.5B by Month, 2016-01-15 to 2016-10-01")

Pittsburgh Air Quality Conclusions

The graphs for Avalon and Lawrenceville show a clear peak in the summer months. However, the Lawrenceville graph shows a spike of PM2.5B in January. It would be interesting to see if there were any construction projects going on in January, as Lawrenceville is a quickly gentrifying neighborhood.

Also, one should be cautious about reading too much into these graphs, as they only contain data for less than one year.