library("stringr")
library("DT")
library("data.table")
library("ggplot2")
library("corrplot")
library("plotly")
library("knitr")
library("ggplot2")
library("corrplot")
library("rvest")
library("dplyr")
library("tidyr")

Overview

In this assignment we need to take 3 untidy data sets that our classmates provided to us and perform the requested analysis.

Massey Ratings

Leland Randles asked whether offense or defense matters more when it comes to Massey Ratings for college football teams. He provided the dataset at the following location: http://www.masseyratings.com/cf/years.txt

Acquire Data
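
The chunk below reads a local copy of the ratings file. If Massey.txt is not already on disk, it can be fetched first (a minimal sketch, assuming the URL above is still live):

download.file("http://www.masseyratings.com/cf/years.txt", "Massey.txt")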

file <- readLines("Massey.txt")

team_v    <- vector()
year_v    <- vector()
offense_v <- vector()
defense_v <- vector()
rating_v  <- vector()

# Skip the header rows at the top of the file
file <- file[9:length(file)]

# Loop through the file, one year block at a time
outer_counter <- 0
while (outer_counter < 69) { # File has 69 years in it
  # Keep the year as character for now; concatenating factors with c()
  # degrades them to integer codes
  year <- trimws(file[[1]])

  # Grab the top 10 teams for the year and loop through them
  chunk <- file[7:16]
  for (inner_counter in 1:10) {
    team_v    <- c(team_v,    trimws(substr(chunk[[inner_counter]], 4, 20)))
    offense_v <- c(offense_v, as.numeric(trimws(substr(chunk[[inner_counter]], 47, 49))))
    defense_v <- c(defense_v, as.numeric(trimws(substr(chunk[[inner_counter]], 53, 55))))
    rating_v  <- c(rating_v,  as.numeric(trimws(substr(chunk[[inner_counter]], 65, 70))))
    year_v    <- c(year_v, year)
  } # end for

  outer_counter <- outer_counter + 1  # plain <-; <<- is only needed inside functions
  file <- file[18:length(file)]
} # end while

massey_ratings <- data.frame(year    = as.factor(year_v),
                             team    = team_v,
                             offense = offense_v,
                             defense = defense_v,
                             rating  = rating_v)

datatable(massey_ratings)
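
A quick sanity check (not in the original analysis) that the loop captured everything it should have: 69 years times the top 10 teams, or 690 rows.

stopifnot(nrow(massey_ratings) == 69 * 10)  # expect 690 rows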

Check Correlations

corrM <- cor(massey_ratings[,c("offense","defense", "rating")])
kable(corrM)
          offense     defense     rating
offense   1.0000000  -0.3506145  -0.3128360
defense  -0.3506145   1.0000000  -0.2317220
rating   -0.3128360  -0.2317220   1.0000000
corrplot(corrM, type="upper", method = "ellipse")
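
To check whether these weak correlations are distinguishable from zero, base R's cor.test reports a p-value for each pairwise relationship (a minimal sketch against the same massey_ratings frame):

cor.test(massey_ratings$offense, massey_ratings$rating)  # offense vs rating
cor.test(massey_ratings$defense, massey_ratings$rating)  # defense vs rating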

ggplot(massey_ratings, aes(x = defense, y = rating)) + geom_point() + ggtitle("Defense vs Ratings") + theme(text = element_text(size = 6))

ggplot(massey_ratings, aes(x = offense, y = rating)) + geom_point() + ggtitle("Offense vs Ratings") + theme(text = element_text(size = 6))

Conclusions

Neither offense nor defense shows more than a weak correlation with rating (-0.31 and -0.23, respectively), so I cannot say that either of them matters more.

Walking Dead Viewership

Walt Wells provided a link to the Wikipedia page for Walking Dead Viewership by Episode: https://en.wikipedia.org/wiki/Template:The_Walking_Dead_ratings

Most TV shows get cancelled when viewership starts to fall. Therefore I am asking: did viewership for this show fall over its last two seasons?

Acquire Data

#Read table from wikipedia

#Begin code based on https://codedump.io/share/Unw54jjrt1cG/1/web-scraping-using-rvest-on-a-tennis-table-from-wiki

walking_dead_url <- "https://en.wikipedia.org/wiki/Template:The_Walking_Dead_ratings"
html_tables <- walking_dead_url %>%
  read_html() %>%
  html_nodes(".wikitable") %>%
  html_table(fill = TRUE, header = TRUE)

#End code based on https://codedump.io/share/Unw54jjrt1cG/1/web-scraping-using-rvest-on-a-tennis-table-from-wiki

data <- html_tables[[2]]
kable(head(data))
Season Episode number Episode number Episode number Episode number Episode number Episode number Episode number Episode number Episode number Episode number Episode number Episode number Episode number Episode number Episode number Episode number
Season 1.000 2.000 3.000 4.000 5.00 6.000 7 8 9 10 11 12 13 14 15 16
1 5.350 4.710 5.070 4.750 5.56 5.970
2 7.260 6.700 6.100 6.290 6.12 6.080 6.620 8.100 6.890 7.040 6.770 6.890 8.990
3 10.970 9.550 10.510 9.270 10.37 9.210 10.430 10.480 12.260 11.050 11.010 11.300 11.460 10.840 10.990 12.420
4 16.111 13.950 12.920 13.310 12.20 12.000 11.290 12.050 15.760 13.340 13.120 12.610 12.650 12.870 13.470 15.680
5 17.300 15.143 13.801 14.518 13.53 14.068 13.330 14.807 15.643 12.267 13.438 14.430 14.534 13.781 13.757 15.784
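
The hard-coded index [[2]] used above assumes the ratings table is the second .wikitable on the page. If the page layout ever shifts, the scraped list can be inspected first (a minimal sketch):

length(html_tables)       # how many wikitable nodes were parsed
lapply(html_tables, dim)  # dimensions of each, to spot the ratings table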

Tidy Data

#Rename the columns to "season" plus episode numbers 1-16
colnames(data) <- c("season", 1:(ncol(data) - 1))

#Drop the duplicated header row, then rotate the data to make it "long"
data <- data[-1, ] %>%
  gather("episode", "viewers", 2:ncol(data)) %>%
  filter(viewers != "—")
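
gather() still works but has been superseded in tidyr; the same reshape can be written with pivot_longer (an equivalent sketch, assuming tidyr >= 1.0):

data <- data[-1, ] %>%
  pivot_longer(-season, names_to = "episode", values_to = "viewers") %>%
  filter(viewers != "—")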

kable(head(data))
season episode viewers
1 1 5.35
2 1 7.26
3 1 10.97
4 1 16.111
5 1 17.3
6 1 14.633
#Cast data to proper types
data$season <- as.factor(data$season)
data$episode <- as.numeric(data$episode)
data$viewers <- as.numeric(data$viewers)

#Add the change in viewership from episode to episode and a
#column that has season and episode concatenated to better display the graph
data <- data %>%
  arrange( season, episode) %>%
  mutate( delta_viewers = viewers- lag(viewers) ) %>%
  mutate( season_episode = paste(season, "-", episode, sep="") )
 
data[1,"delta_viewers"] <- data[1,"viewers"]

kable(head(data))
season episode viewers delta_viewers season_episode
1 1 5.35 5.35 1-1
1 2 4.71 -0.64 1-2
1 3 5.07 0.36 1-3
1 4 4.75 -0.32 1-4
1 5 5.56 0.81 1-5
1 6 5.97 0.41 1-6

Analyze Data

#Plot the data as it appears on Wikipedia
ggplot(data, aes(x = season_episode, y = viewers, fill = season)) +
  geom_bar(stat = "identity") +
  theme(text = element_text(size = 6), axis.text.x = element_text(angle = 90)) +
  ggtitle("Walking Dead viewership per episode in millions")

#Get just the last 2 seasons
last_two_seasons <- filter(data, season %in% c(5, 6))

#Average the change in viewership per episode over the last two seasons
#(length() on a data frame counts columns, so take mean() over the rows instead)
average_delta_per_episode <- mean(last_two_seasons$delta_viewers)

The average viewership changed by -0.3 million per episode over the last two seasons, so I can confirm my hypothesis that viewership declined, which (per the premise above) puts a show at risk of cancellation. Also note that in the graph the last two episodes had an uptick in viewership, so if you threw out those two episodes the average decline per episode would be greater. It is common for a season finale to see an uptick in viewership.
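
To quantify that, a quick sketch (a hypothetical follow-up, not part of the original analysis) that recomputes the average after dropping the last two episodes of season 6:

no_finale <- last_two_seasons %>%
  filter(!(season == 6 & episode >= 15))  # drop the final two episodes
mean(no_finale$delta_viewers)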

Pittsburgh Air Quality

I (Raphael Nash) submitted a data set on the air quality in Pittsburgh, PA. In Pittsburgh during the summer we have days that are “air quality action days”. During these days anyone with respiratory problems should stay indoors. The worst pollutant in Pittsburgh is called PM2.5, otherwise known as fine particulates, mostly from coal-fired industry. I would like to check whether PM2.5 is higher in the summer than in the winter. I found a data set at: https://data.wprdc.org/dataset/allegheny-county-air-quality/resource/967f1285-f8fb-4785-9673-64a8ae47588d . This dataset only includes data from the dates 20160115 - 20161001.

Acquire Data

pgh_air_quality <- read.csv("pgh_air_quality.csv", stringsAsFactors = FALSE)
pgh_air_quality$date <- as.Date(pgh_air_quality$datetime)
pgh_air_quality$site <- as.factor(pgh_air_quality$site)
kable(head(pgh_air_quality))
(Wide table omitted for readability: one row per site / datetime / stat combination, with columns X_id, site, datetime, stat, a value and _txt column pair for each measurement (BCSTAT, BP, CO, H2S, INT_T, NO, NO2, NOX, NOY, NOYDIF, OUT_RH, OUT_T, OZONE, OZONE2, PER_F, PER_F2, PM10, PM10_FL, PM10B, PM25, PM25.2., PM25_FL, PM25B, PM25T, RWD, RWS, SIGTHETA, SO2, SONICWD, SONICWS, UVPM), and the derived date column. Most cells are NA because each site reports only a subset of the measurements.)

Prepare Data for PM2.5

#Subset only data related to PM2.5 Particles
pgh_pm25 <- pgh_air_quality[, c("site", "date", "stat", "PM25")]
pgh_pm25 <- drop_na(pgh_pm25,  PM25)

kable(head(pgh_pm25))
site date stat PM25
274 Lincoln 2016-02-21 Avg 30
373 Liberty 2 2016-01-15 Count 24
375 Liberty 2 2016-01-15 Min 2
376 Liberty 2 2016-01-15 Max 43
377 Liberty 2 2016-01-15 Hr. of Max 0
378 Liberty 2 2016-01-15 Avg 16
#Rotate data so that it is one observation per row
#an observation is a measurement at a site on a day
pgh_pm25 <- spread(pgh_pm25, stat, PM25) 
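
spread() has likewise been superseded in tidyr; the rotation above is equivalent to pivot_wider (a sketch, assuming tidyr >= 1.0):

pgh_pm25 <- pivot_wider(pgh_pm25, names_from = stat, values_from = PM25)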

#Add Month as Month Number
pgh_pm25 <- mutate(pgh_pm25, month= as.numeric(format(date,"%m"))) 

kable(head(pgh_pm25))
site date Avg Count Hr. of Max Max Min month
Liberty 2 2016-01-15 16 24 0 43 2 1
Liberty 2 2016-01-16 4 24 9 12 2 1
Liberty 2 2016-01-17 4 24 19 8 2 1
Liberty 2 2016-01-18 4 24 18 6 3 1
Liberty 2 2016-01-19 5 24 22 13 3 1
Liberty 2 2016-01-20 9 24 5 15 4 1
#Average all daily max readings for a month
pgh_pm25_avg_max <-  pgh_pm25  %>%
  drop_na(Max) %>% 
  group_by(site,month) %>% 
  summarise(avg_max= mean(Max))

kable(head(pgh_pm25_avg_max))
site month avg_max
Liberty 2 1 22.10000
Liberty 2 2 19.82759
Liberty 2 3 26.87097
Liberty 2 4 29.06667
Liberty 2 5 32.48387
Liberty 2 6 30.70000

Analyze Data for PM2.5

ggplot(pgh_pm25_avg_max, aes(x = as.factor(month), y = avg_max, group = site, colour = site)) + geom_line() + xlab("Month") + ylab("Avg Max PM2.5") + ggtitle("Avg Daily Max PM2.5 by Month, 2016-01-15 to 2016-10-01")
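
Since the question is specifically whether PM2.5 is higher in the summer than in the winter, a direct comparison helps. The sketch below is a hypothetical follow-up: it labels June through August as summer and January/February as winter (the data stop at 2016-10-01, so no December readings exist) and averages the daily maxima per site:

seasonal_pm25 <- pgh_pm25 %>%
  drop_na(Max) %>%
  mutate(period = case_when(month %in% 6:8 ~ "summer",
                            month %in% c(1, 2) ~ "winter",
                            TRUE ~ "other")) %>%
  filter(period != "other") %>%
  group_by(site, period) %>%
  summarise(avg_max = mean(Max))

kable(seasonal_pm25)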

I found it interesting that “Liberty 2” shows a clear seasonal pattern, but at the Lincoln site PM2.5 particles keep increasing. So I found more documentation at http://www.achd.net/air/publiccomment2014/netrev2014draft.pdf on the Allegheny County Health Dept (ACHD) air quality monitoring sites. This document notes that the Lincoln and Liberty 2 sites are downwind of USS Clairton Works, a coke processing plant. Coke is made from coal and is the raw fuel for steel blast furnaces. It would be interesting to see if there is an issue at USS Clairton that is causing the uptrend in PM2.5.

Per the daily summaries that the ACHD puts out (http://www.achd.net/airqual/DailySummary.PDF), it looks like PM2.5B is the better measure. It is measured at two sites: one in Lawrenceville, a neighborhood in Pittsburgh that is undergoing a revitalization, and one in Avalon, a largely residential borough six miles down the Ohio River from downtown Pittsburgh.

Prepare Data for PM2.5B

#Subset only data related to PM2.5B Particles
pgh_pm25B <- pgh_air_quality[, c("site", "date", "stat", "PM25B")]
pgh_pm25B <- drop_na(pgh_pm25B,  PM25B)

kable(head(pgh_pm25B))
site date stat PM25B
1 Avalon 2016-01-15 Count 24
4 Avalon 2016-01-15 Min 2
5 Avalon 2016-01-15 Max 19
7 Avalon 2016-01-15 Hr. of Max 17
8 Avalon 2016-01-15 Avg 11
9 Avalon 2016-01-16 Count 24
#Rotate data so that it is one observation per row
#an observation is a measurement at a site on a day
pgh_pm25B <- spread(pgh_pm25B, stat, PM25B) 

#Add Month as Month Number
pgh_pm25B <- mutate(pgh_pm25B, month= as.numeric(format(date,"%m"))) 

kable(head(pgh_pm25B))
site date Avg Count Hr. of Max Max Min month
Avalon 2016-01-15 11 24 17 19 2 1
Avalon 2016-01-16 7 24 8 22 1 1
Avalon 2016-01-17 5 24 22 12 3 1
Avalon 2016-01-18 7 24 23 11 4 1
Avalon 2016-01-19 10 24 13 22 5 1
Avalon 2016-01-20 16 24 8 22 10 1
#Average all daily max readings for a month
pgh_pm25B_avg_max <-  pgh_pm25B  %>%
  drop_na(Max) %>% 
  group_by(site,month) %>% 
  summarise(avg_max= mean(Max))

kable(head(pgh_pm25B_avg_max))
site month avg_max
Avalon 1 18.90000
Avalon 2 17.82759
Avalon 3 20.19355
Avalon 4 19.33333
Avalon 5 19.54839
Avalon 6 23.26667

Analyze Data for PM2.5B

ggplot(pgh_pm25B_avg_max, aes(x = as.factor(month), y = avg_max, group = site, colour = site)) + geom_line() + xlab("Month") + ylab("Avg Max PM2.5B") + ggtitle("Avg Daily Max PM2.5B by Month, 2016-01-15 to 2016-10-01")

Pittsburgh Air Quality Conclusions

The graphs for Avalon and Lawrenceville show a clear peak in the summer months. However, the Lawrenceville graph shows a spike of PM2.5B in January. It would be interesting to see if there were any construction projects going on in January, as Lawrenceville is a quickly gentrifying neighborhood.

Also, one should be cautious about reading too much into these graphs, as they only contain data for less than one year.