library("stringr")
library("DT")
library("data.table")
library("ggplot2")
library("corrplot")
library("plotly")
library("knitr")
library("ggplot2")
library("corrplot")
library("rvest")
library("dplyr")
library("tidyr")
In this assignement we need to take 3 untidy data sets that our classmates provided to use and perform the analysis requested
Leland Randles asked does offense or defense matter more when it comes to Massey Ratings for college football teams. He provided the dataset at the following location: http://www.masseyratings.com/cf/years.txt
file <- readLines("Massey.txt")
team_v <- vector()
year_v <-vector()
offense_v <- vector()
defense_v <- vector()
rating_v <- vector()
# Skip first 9 rows
file <- file[9: length(file)]
file_rows <- length(file)
# Loop through file
outer_counter <- 0
while (outer_counter < 69 ) { #File has 69 years in it
year <- as.factor(trimws(file[[1]]))
# Grab top 10 teams for the year and loop througth them
chunk <- file [7:16]
for (inner_counter in 1:10) {
team_v<- c(team_v, trimws(substr(chunk[[inner_counter]],4,20)))
offense_v <- c(offense_v, as.numeric( trimws(substr(chunk[[inner_counter]],47,49))))
defense_v <- c(defense_v, as.numeric( trimws(substr(chunk[[inner_counter]],53,55))))
rating_v <- c (rating_v, as.numeric( trimws(substr(chunk[[inner_counter]],65,70))))
year_v <- c(year_v, year)
} # end for
outer_counter <<- outer_counter + 1
file <- file [18:length(file)]
} #end while
massey_ratings <- data.frame( year = year_v,
team=team_v,
offense = offense_v,
defense = defense_v,
rating = rating_v )
datatable(massey_ratings)
corrM <- cor(massey_ratings[,c("offense","defense", "rating")])
kable(corrM)
| offense | defense | rating | |
|---|---|---|---|
| offense | 1.0000000 | -0.3506145 | -0.312836 |
| defense | -0.3506145 | 1.0000000 | -0.231722 |
| rating | -0.3128360 | -0.2317220 | 1.000000 |
corrplot(corrM, type="upper", method = "ellipse")
ggplot(massey_ratings, aes( x = defense, y= rating)) +geom_point() + ggtitle( "Defense vs Ratings") + theme( text = element_text(size=6))
ggplot(massey_ratings, aes( x = offense, y= rating)) +geom_point() + ggtitle("Offense vs Ratings") + theme( text = element_text(size=6))
There really does not seem to be any correlation between either offense or defense and rating therefore I can not say that either of them matters more.
Walt Wells provided a link to the Wikipedia page for Walking Dead Viewership by Episode: https://en.wikipedia.org/wiki/Template:The_Walking_Dead_ratings
Most TV shows gets cancelled when viewership starts to fall. Therefore I am asking did viewership for this show fall over the last two seasons?
#Read table from wikipedia
#Begin code based on https://codedump.io/share/Unw54jjrt1cG/1/web-scraping-using-rvest-on-a-tennis-table-from-wiki
walking_dead_url <- "https://en.wikipedia.org/wiki/Template:The_Walking_Dead_ratings"
html_tables <- walking_dead_url %>%
read_html() %>%
html_nodes( ".wikitable") %>%
html_table(fill = TRUE, header = TRUE )
#End code based on https://codedump.io/share/Unw54jjrt1cG/1/web-scraping-using-rvest-on-a-tennis-table-from-wiki
data <- html_tables [[2]]
kable(head(data))
| Season | Episode number | Episode number | Episode number | Episode number | Episode number | Episode number | Episode number | Episode number | Episode number | Episode number | Episode number | Episode number | Episode number | Episode number | Episode number | Episode number |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| Season | 1.000 | 2.000 | 3.000 | 4.000 | 5.00 | 6.000 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 |
| 1 | 5.350 | 4.710 | 5.070 | 4.750 | 5.56 | 5.970 | — | — | — | — | — | — | — | — | — | — |
| 2 | 7.260 | 6.700 | 6.100 | 6.290 | 6.12 | 6.080 | 6.620 | 8.100 | 6.890 | 7.040 | 6.770 | 6.890 | 8.990 | — | — | — |
| 3 | 10.970 | 9.550 | 10.510 | 9.270 | 10.37 | 9.210 | 10.430 | 10.480 | 12.260 | 11.050 | 11.010 | 11.300 | 11.460 | 10.840 | 10.990 | 12.420 |
| 4 | 16.111 | 13.950 | 12.920 | 13.310 | 12.20 | 12.000 | 11.290 | 12.050 | 15.760 | 13.340 | 13.120 | 12.610 | 12.650 | 12.870 | 13.470 | 15.680 |
| 5 | 17.300 | 15.143 | 13.801 | 14.518 | 13.53 | 14.068 | 13.330 | 14.807 | 15.643 | 12.267 | 13.438 | 14.430 | 14.534 | 13.781 | 13.757 | 15.784 |
#Replace column headings with first row of data
colnames(data) <- c("season", 1:(ncol(data)-1 ))
#Rotate data to make it "Long"
data <- data[-1,] %>%
gather( "episode", "viewers", 2:ncol(data)) %>%
filter( viewers != "—")
kable(head(data))
| season | episode | viewers |
|---|---|---|
| 1 | 1 | 5.35 |
| 2 | 1 | 7.26 |
| 3 | 1 | 10.97 |
| 4 | 1 | 16.111 |
| 5 | 1 | 17.3 |
| 6 | 1 | 14.633 |
#Cast data to proper types
data$season <- as.factor(data$season)
data$episode <- as.numeric(data$episode)
data$viewers <- as.numeric(data$viewers)
#Add the change in viewership from episode to episode and a
#column that has season and episode concatenated to bettter display graph
data <- data %>%
arrange( season, episode) %>%
mutate( delta_viewers = viewers- lag(viewers) ) %>%
mutate( season_episode = paste(season, "-", episode, sep="") )
data[1,"delta_viewers"] <- data[1,"viewers"]
kable(head(data))
| season | episode | viewers | delta_viewers | season_episode |
|---|---|---|---|---|
| 1 | 1 | 5.35 | 5.35 | 1-1 |
| 1 | 2 | 4.71 | -0.64 | 1-2 |
| 1 | 3 | 5.07 | 0.36 | 1-3 |
| 1 | 4 | 4.75 | -0.32 | 1-4 |
| 1 | 5 | 5.56 | 0.81 | 1-5 |
| 1 | 6 | 5.97 | 0.41 | 1-6 |
#plot data as it is on Wikipedia
ggplot(data, aes(x=season_episode, y=viewers, fill=season)) +
geom_bar(stat="identity") +
theme( text = element_text(size=6), axis.text.x = element_text(angle=90)) +
ggtitle ("Walking dead viewership per episode in millions")
#Get just the last 2 seasons
last_two_seasons <- filter(data, season %in% c(5,6))
#Average the change in viewership per episode over the last two seasons
average_delta_per_episode <- sum(last_two_seasons$delta_viewers) / length(last_two_seasons)
The average viewership chnaged by -0.3 millions per episode over the last 2 seasons. Therefore I would confirm my hypothesis that viewership declined therefore the walking dead was cancelled. Also note that in the graph the last 2 episodes had an uptick in viewership therefore if you through out those two episodes the average decline per episode would be greater. It is common that the series finale would have an uptick in viewership.
I (Raphael Nash) submitted a data set on the air quality in Pittsburgh, PA. In Pittsburgh durring the summer we have days that are “aire quaility action days”. Durring these days anyone with respritory problems should stay indoors. The worst pollutant in Pittsburgh is called PM2.5 otherwise know as fine particulates, mostly from coal fired industry. I would like to check if PM2.5 is higer in the summer then the winter. I found a data set at: https://data.wprdc.org/dataset/allegheny-county-air-quality/resource/967f1285-f8fb-4785-9673-64a8ae47588d . This dataset only includes data from the dates 20160115 - 20161001 .
pgh_air_quality <- read.csv("pgh_air_quality.csv", stringsAsFactors = FALSE)
pgh_air_quality$date <- as.Date(pgh_air_quality$datetime)
pgh_air_quality$site <- as.factor(pgh_air_quality$site )
kable(head(pgh_air_quality))
| X_id | site | datetime | stat | BCSTAT | BCSTAT_txt | BP | BP_txt | CO | CO_txt | H2S | H2S_txt | INT_T | INT_T_txt | NO | NO_txt | NO2 | NO2_txt | NOX | NOX_txt | NOY | NOY_txt | NOYDIF | NOYDIF_txt | OUT_RH | OUT_RH_txt | OUT_T | OUT_T_txt | OZONE | OZONE_txt | OZONE2 | OZONE2_txt | PER_F | PER_F_txt | PER_F2 | PER_F2_txt | PM10 | PM10_txt | PM10_FL | PM10_FL_txt | PM10B | PM10B_txt | PM25 | PM25_txt | PM25.2. | PM25.2._txt | PM25_FL | PM25_FL_txt | PM25B | PM25B_txt | PM25T | PM25T_txt | RWD | RWD_txt | RWS | RWS_txt | SIGTHETA | SIGTHETA_txt | SO2 | SO2_txt | SONICWD | SONICWD_txt | SONICWS | SONICWS_txt | UVPM | UVPM_txt | date |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 577 | Avalon | 2016-01-15T00:00:00 | Count | NA | NA | 24.0 | NA | NA | NA | 24 | 24.0 | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | 24.0 | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | 24 | NA | NA | NA | NA | NA | NA | 24.0 | NA | 24.000 | 24 | NA | 24.0 | NA | NA | NA | 2016-01-15 | ||||||||||
| 578 | Avalon | 2016-01-15T00:00:00 | Total | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | 2016-01-15 | ||||||||||
| 8 | Avalon | 2016-01-30T00:00:00 | Total | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | 2016-01-30 | ||||||||||
| 579 | Avalon | 2016-01-15T00:00:00 | Min | NA | NA | 726.8 | NA | NA | NA | 0 | 24.1 | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | 0.5 | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | 2 | NA | NA | NA | NA | NA | NA | 12.4 | NA | 0.000 | 6 | NA | 1.5 | NA | NA | NA | 2016-01-15 | ||||||||||
| 580 | Avalon | 2016-01-15T00:00:00 | Max | NA | NA | 736.4 | NA | NA | NA | 0 | 25.8 | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | 10.1 | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | 19 | NA | NA | NA | NA | NA | NA | 56.1 | NA | 0.003 | 192 | NA | 6.5 | NA | NA | NA | 2016-01-15 | ||||||||||
| 1831 | Lawrenceville 2 | 2016-02-16T00:00:00 | Count | NA | NA | NA | NA | 22 | NA | NA | 24.0 | NA | 21 | NA | NA | NA | NA | NA | 21 | NA | 21 | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | 22.000 | NA | NA | NA | NA | NA | NA | 2016-02-16 |
#Subset only data related to PM2.5 Particles
pgh_pm25 <- pgh_air_quality[, c("site", "date", "stat", "PM25")]
pgh_pm25 <- drop_na(pgh_pm25, PM25)
kable(head(pgh_pm25))
| site | date | stat | PM25 | |
|---|---|---|---|---|
| 274 | Lincoln | 2016-02-21 | Avg | 30 |
| 373 | Liberty 2 | 2016-01-15 | Count | 24 |
| 375 | Liberty 2 | 2016-01-15 | Min | 2 |
| 376 | Liberty 2 | 2016-01-15 | Max | 43 |
| 377 | Liberty 2 | 2016-01-15 | Hr. of Max | 0 |
| 378 | Liberty 2 | 2016-01-15 | Avg | 16 |
#Rotate data so that it is one observation per row
#an observation is a measurement at a site on a day
pgh_pm25 <- spread(pgh_pm25, stat, PM25)
#Add Month as Month Number
pgh_pm25 <- mutate(pgh_pm25, month= as.numeric(format(date,"%m")))
kable(head(pgh_pm25))
| site | date | Avg | Count | Hr. of Max | Max | Min | month |
|---|---|---|---|---|---|---|---|
| Liberty 2 | 2016-01-15 | 16 | 24 | 0 | 43 | 2 | 1 |
| Liberty 2 | 2016-01-16 | 4 | 24 | 9 | 12 | 2 | 1 |
| Liberty 2 | 2016-01-17 | 4 | 24 | 19 | 8 | 2 | 1 |
| Liberty 2 | 2016-01-18 | 4 | 24 | 18 | 6 | 3 | 1 |
| Liberty 2 | 2016-01-19 | 5 | 24 | 22 | 13 | 3 | 1 |
| Liberty 2 | 2016-01-20 | 9 | 24 | 5 | 15 | 4 | 1 |
#Average all daily max readings for a month
pgh_pm25_avg_max <- pgh_pm25 %>%
drop_na(Max) %>%
group_by(site,month) %>%
summarise(avg_max= mean(Max))
kable(head(pgh_pm25_avg_max))
| site | month | avg_max |
|---|---|---|
| Liberty 2 | 1 | 22.10000 |
| Liberty 2 | 2 | 19.82759 |
| Liberty 2 | 3 | 26.87097 |
| Liberty 2 | 4 | 29.06667 |
| Liberty 2 | 5 | 32.48387 |
| Liberty 2 | 6 | 30.70000 |
ggplot(pgh_pm25_avg_max, aes(x= as.factor(month), y = avg_max, group = site, colour= site)) + geom_line() + xlab("Month") + ylab("Avg Max PM2.5") + ggtitle("Avg Daily Max PM2.5B Particles for the by month for 20160115 - 20161001")
I found it intresting that “Liberty 2” shows a clear seasonal pattern, but at the Lincoln site PM2.5 particles keep on increasing. So I found more documentation at http://www.achd.net/air/publiccomment2014/netrev2014draft.pdf on the Allegheny County Health Dept (ACHD) Air Quality monitoring site. This documnet notes that Lincoln and Liberty 2 sites are down wind of USS Clariton Works, a Coke processing plant. Coke is made from coal and is the raw fuel for steel blast furnaces. It would be intresting to see if there is an issue at USS Clariton thhat is causing the uptrend in PM2.5.
Per the daily summaries that the ACHD puts out (http://www.achd.net/airqual/DailySummary.PDF), it looks like PM2.5B is the better measure. It is measured at 2 sites one in Lawernceville, a neighborhood in Pittsburgh that is undergoing a revitilization, and at a site in Avolon which is 6 miles down the Ohio river from downtown Pittsburgh, and a largley residential borough.
#Subset only data related to PM2.5 Particles
pgh_pm25B <- pgh_air_quality[, c("site", "date", "stat", "PM25B")]
pgh_pm25B <- drop_na(pgh_pm25B, PM25B)
kable(head(pgh_pm25B))
| site | date | stat | PM25B | |
|---|---|---|---|---|
| 1 | Avalon | 2016-01-15 | Count | 24 |
| 4 | Avalon | 2016-01-15 | Min | 2 |
| 5 | Avalon | 2016-01-15 | Max | 19 |
| 7 | Avalon | 2016-01-15 | Hr. of Max | 17 |
| 8 | Avalon | 2016-01-15 | Avg | 11 |
| 9 | Avalon | 2016-01-16 | Count | 24 |
#Rotate data so that it is one observation per row
#an observation is a measurement at a site on a day
pgh_pm25B <- spread(pgh_pm25B, stat, PM25B)
#Add Month as Month Number
pgh_pm25B <- mutate(pgh_pm25B, month= as.numeric(format(date,"%m")))
kable(head(pgh_pm25B))
| site | date | Avg | Count | Hr. of Max | Max | Min | month |
|---|---|---|---|---|---|---|---|
| Avalon | 2016-01-15 | 11 | 24 | 17 | 19 | 2 | 1 |
| Avalon | 2016-01-16 | 7 | 24 | 8 | 22 | 1 | 1 |
| Avalon | 2016-01-17 | 5 | 24 | 22 | 12 | 3 | 1 |
| Avalon | 2016-01-18 | 7 | 24 | 23 | 11 | 4 | 1 |
| Avalon | 2016-01-19 | 10 | 24 | 13 | 22 | 5 | 1 |
| Avalon | 2016-01-20 | 16 | 24 | 8 | 22 | 10 | 1 |
#Average all daily max readings for a month
pgh_pm25B_avg_max <- pgh_pm25B %>%
drop_na(Max) %>%
group_by(site,month) %>%
summarise(avg_max= mean(Max))
kable(head(pgh_pm25B_avg_max))
| site | month | avg_max |
|---|---|---|
| Avalon | 1 | 18.90000 |
| Avalon | 2 | 17.82759 |
| Avalon | 3 | 20.19355 |
| Avalon | 4 | 19.33333 |
| Avalon | 5 | 19.54839 |
| Avalon | 6 | 23.26667 |
ggplot(pgh_pm25B_avg_max, aes(x= as.factor(month), y = avg_max, group = site, colour= site)) + geom_line() + xlab("Month") + ylab("Avg Max PM2.5B") + ggtitle("Avg Daily Max PM2.5B Particles for the by month for 20160115 - 20161001")
The graph of the Avalon and Lawrenceville, show a clear peak in the summer months. However, the Lawerenceville graph shows a spike of PM2.5 in January. It would be intresting see if there where any construction projects going on in January as Lawerenceville is a quickly gentrifying neighborhood.
Also, One should be cautious about reading too much into theese graphs as then only contain data for less then 1 year.