Libraries used
library(rvest)
## Loading required package: xml2
library(tidyverse)
## -- Attaching packages ---------------------------------------------------------------- tidyverse 1.2.1 --
## v ggplot2 3.0.0 v purrr 0.2.5
## v tibble 1.4.2 v dplyr 0.7.6
## v tidyr 0.8.1 v stringr 1.3.1
## v readr 1.1.1 v forcats 0.3.0
## -- Conflicts ------------------------------------------------------------------- tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x readr::guess_encoding() masks rvest::guess_encoding()
## x dplyr::lag() masks stats::lag()
## x purrr::pluck() masks rvest::pluck()
library(stringr)
library(ggplot2)
Step 1: Web-Scraping and Initial Cleaning
# Step 1 scrape: pull series titles, premiere/finale date ranges and networks
# from The Futon Critic's drama listing and assemble them into `futondf`.
futon <- read_html("http://thefutoncritic.com/showatch.aspx?sort=yearstart&series=&network=&daycode=&statuscode=&genre=drama&studio=")

# Series titles. tail(x, -k) drops the first k matches and, unlike
# x[(k + 1):length(x)], cannot run backwards if fewer than k nodes come back.
series1 <- futon %>%
  html_nodes("table+ table tr+ tr a") %>%
  html_text()
series2 <- tail(series1, -11) # the first 11 values have to be removed

# Date ranges (second table column), with line breaks and tabs stripped.
date1 <- futon %>%
  html_nodes("td td table+ table td:nth-child(2)") %>%
  html_text()
date1 <- gsub("\r\n|\t", "", date1) # have to clean the strings a bit
date2 <- tail(date1, -3) # the first three values have to be removed

# Networks (fourth table column).
network1 <- futon %>%
  html_nodes("td td table+ table td:nth-child(4)") %>%
  html_text()
network2 <- tail(network1, -3) # the first three values have to be removed

# The three vectors are matched up purely by position, so refuse to continue
# if the selectors returned different numbers of rows (cbind() would recycle
# the shorter ones and silently misalign titles, networks and dates).
stopifnot(
  length(series2) == length(network2),
  length(series2) == length(date2)
)

# Build the data frame directly so the columns stay character vectors instead
# of passing through a matrix (and becoming factors on R < 4.0).
futondf <- data.frame(
  series2 = series2,
  network2 = network2,
  date2 = date2,
  stringsAsFactors = FALSE
)
head(futondf, n = 100)
## series2 network2
## 1 12 MILES OF BAD ROAD HBO
## 2 1983 NETFLIX
## 3 54 HOURS SUNDANCE NOW
## 4 ABC MURDERS, THE AMAZON
## 5 BABY NETFLIX
## 6 BLACK EARTH RISING NETFLIX
## 7 BOYS, THE AMAZON
## 8 CARNIVAL ROW AMAZON
## 9 CHEAT SUNDANCE NOW
## 10 CHRONICLE MYSTERIES, THE HMC
## 11 CITY ON A HILL SHOWTIME
## 12 CODE, THE CBS
## 13 COISA MAIS LINDA NETFLIX
## 14 CROSSWORD MYSTERIES HMC
## 15 CRY, THE SUNDANCE NOW
## 16 DEADLY CLASS SYFY
## 17 DISAPPEARANCE, THE WGN
## 18 DOGS OF BERLIN NETFLIX
## 19 ENEMY WITHIN, THE NBC
## 20 FIX, THE ABC
## 21 FLACK POP
## 22 FLING FOX
## 23 FOOTBALLERS' WIVES BBC AMERICA
## 24 FOREIGN BODIES TNT
## 25 FOYLE'S WAR PBS
## 26 GENTLEMAN JACK HBO
## 27 GONE WGN
## 28 GOOD OMENS AMAZON
## 29 GRAND HOTEL ABC
## 30 HAUNTING OF HILL HOUSE, THE NETFLIX
## 31 HEARTLAND UP
## 32 HEATHERS PARAMOUNT
## 33 I AM THE NIGHT TNT
## 34 IN THE DARK CW
## 35 INBETWEEN, THE NBC
## 36 INSPECTOR LEWIS PBS
## 37 JOHN WOO'S ONCE A THIEF SYNDICATION
## 38 MANCHESTER PREP FOX
## 39 MASTERPIECE! PBS
## 40 MESSIAH NETFLIX
## 41 MISS MARPLE PBS
## 42 MY BRILLIANT FRIEND HBO
## 43 NAME OF THE ROSE, THE SUNDANCE
## 44 NEW POPE, THE HBO
## 45 NIGHTFLYERS SYFY
## 46 NORTHERN RESCUE NETFLIX
## 47 ON BECOMING A GOD IN CENTRAL FLORIDA YOUTUBE
## 48 ORDER, THE NETFLIX
## 49 ORIGIN YOUTUBE
## 50 OSMOSIS NETFLIX
## 51 PASSAGE, THE FOX
## 52 PICTURE PERFECT HMC
## 53 PINE GAP NETFLIX
## 54 POIROT PBS
## 55 PRETTY LITTLE LIARS: THE PERFECTIONISTS FREEFORM
## 56 PROTECTOR, THE NETFLIX
## 57 PROVEN INNOCENT FOX
## 58 QUICKSAND NETFLIX
## 59 RED LINE, THE CBS
## 60 ROOK, THE STARZ
## 61 ROSWELL, NEW MEXICO CW
## 62 SEPTUPLETS FOX
## 63 SEX EDUCATION NETFLIX
## 64 SIEMPRE BRUJA NETFLIX
## 65 SILENT WITNESS BBC AMERICA
## 66 SPANISH PRINCESS, THE STARZ
## 67 STARHUNTER SYNDICATION
## 68 STILL LIFE FOX
## 69 TITANS DC
## 70 TOO OLD TO DIE YOUNG AMAZON
## 71 TRUTH ABOUT THE HARRY QUEBERT AFFAIR, THE EPIX
## 72 UMBRELLA ACADEMY, THE NETFLIX
## 73 UNDERCOVER NETFLIX
## 74 UNSPEAKABLE SUNDANCE
## 75 VALLEY OF THE BOOM NGC
## 76 VANITY FAIR AMAZON
## 77 VILLAGE, THE NBC
## 78 WAKING THE DEAD BBC AMERICA
## 79 WALLANDER PBS
## 80 WARRIOR CINEMAX
## 81 WATERFRONT CBS
## 82 WHISKEY CAVALIER ABC
## 83 WHITE DRAGON AMAZON
## 84 WIRE IN THE BLOOD BBC AMERICA
## 85 WONDERFUL WORLD OF DISNEY, THE ABC
## 86 BAYWATCH SYNDICATION
## 87 LAW & ORDER NBC
## 88 BEVERLY HILLS, 90210 FOX
## 89 DIAGNOSIS MURDER CBS
## 90 WALKER, TEXAS RANGER CBS
## 91 X-FILES, THE FOX
## 92 NYPD BLUE ABC
## 93 PARTY OF FIVE FOX
## 94 CHICAGO HOPE CBS
## 95 ER NBC
## 96 TOUCHED BY AN ANGEL CBS
## 97 STAR TREK: VOYAGER UPN
## 98 OUTER LIMITS, THE SCI FI
## 99 XENA: WARRIOR PRINCESS SYNDICATION
## 100 JAG CBS
## date2
## 1 ???
## 2 ???
## 3 ???
## 4 ???
## 5 ???
## 6 ???
## 7 ???
## 8 ???
## 9 ???
## 10 ???
## 11 ???
## 12 ???
## 13 ???
## 14 ???
## 15 ???
## 16 ???
## 17 ???
## 18 ???
## 19 ???
## 20 ???
## 21 ???
## 22 ???
## 23 ???-8/1/07
## 24 ???
## 25 ???-9/29/13
## 26 ???
## 27 ???
## 28 ???
## 29 ???
## 30 ???
## 31 ???
## 32 ???
## 33 ???
## 34 ???
## 35 ???
## 36 ???-8/21/16
## 37 ???
## 38 ???
## 39 ???
## 40 ???
## 41 ???-9/28/14
## 42 ???
## 43 ???
## 44 ???
## 45 ???
## 46 ???
## 47 ???
## 48 ???
## 49 ???
## 50 ???
## 51 ???
## 52 ???
## 53 ???
## 54 ???-8/3/14
## 55 ???
## 56 ???
## 57 ???
## 58 ???
## 59 ???
## 60 ???
## 61 ???
## 62 ???
## 63 ???
## 64 ???
## 65 ???-4/16/07
## 66 ???
## 67 ???
## 68 ???
## 69 ???
## 70 ???
## 71 ???
## 72 ???
## 73 ???
## 74 ???
## 75 ???
## 76 ???
## 77 ???
## 78 ???-5/28/07
## 79 ???-5/22/16
## 80 ???
## 81 ???
## 82 ???
## 83 ???
## 84 ???-5/4/08
## 85 10/27/54-12/24/08
## 86 4/23/89-5/19/01
## 87 9/13/90-5/24/10
## 88 10/4/90-5/17/00
## 89 1/5/92-5/11/01
## 90 4/21/93-5/19/01
## 91 9/10/93-3/21/18
## 92 9/21/93-3/1/05
## 93 9/12/94-5/3/00
## 94 9/18/94-5/4/00
## 95 9/19/94-4/2/09
## 96 9/21/94-4/27/03
## 97 1/16/95-5/23/01
## 98 3/26/95-1/18/02
## 99 9/15/95-6/23/01
## 100 9/23/95-4/29/05
Step 2: Table Manipulation (Cleaning, Classifying, and Filtering)
# Only interested in dramas that have actual start date data. Undated entries
# carry a "???" placeholder in date2, so keep the rows whose date begins with
# a digit instead of relying on a fixed row offset (the offset of 86 used
# previously skipped the first dated row of the listing and would drift
# whenever the site adds or removes an undated show).
has_start_date <- grepl("^[0-9]", futondf$date2)
futondf2 <- futondf[has_start_date, ]

# Split "start-end" into two columns. Shows without an end date yield NA in
# `enddate`; separate() reports those rows in a warning.
futondf3 <- futondf2 %>%
  separate(date2, into = c("startdate", "enddate"), sep = "-")
## Warning: Expected 2 pieces. Missing pieces filled with `NA` in 34 rows
## [388, 417, 428, 485, 660, 677, 679, 690, 736, 774, 805, 825, 883, 945, 948,
## 949, 995, 998, 1055, 1058, ...].
# Parse "m/d/yy" strings into Date values.
#
# R's %y maps two-digit years 00-68 to 20xx, so premieres from the 1950s and
# 1960s come back a century too late. Any parsed date lying beyond `horizon`
# (a few years past today) is moved back 100 years. This replaces a manual
# overwrite of row 1, which depended on which show happened to be listed first.
#
# x:       character vector of dates such as "10/27/54" (NA allowed).
# horizon: latest date accepted as a genuine 20xx date.
# Returns a Date vector the same length as x.
parse_mdy <- function(x, horizon = Sys.Date() + 5 * 365) {
  parsed <- as.Date(x, format = "%m/%d/%y")
  too_late <- !is.na(parsed) & parsed > horizon
  if (any(too_late)) {
    fixed_year <- as.integer(format(parsed[too_late], "%Y")) - 100L
    parsed[too_late] <- as.Date(
      paste0(fixed_year, format(parsed[too_late], "-%m-%d")),
      format = "%Y-%m-%d"
    )
  }
  parsed
}

futondf3$startdate <- parse_mdy(futondf3$startdate)
futondf3$enddate <- parse_mdy(futondf3$enddate)

# Premiere year as a character column, used as a discrete axis when plotting.
futondf3$year <- format(futondf3$startdate, "%Y")

# Classify networks by type; anything not listed below counts as cable.
broadcast_networks <- c(
  "ABC", "CBS", "FOX", "NBC", "CW", "MYNETWORKTV", "UPN", "WB", "PBS", "PAX",
  "ION", "TELEMUNDO", "UNIVISION", "SYNDICATION"
)
streaming_networks <- c(
  "AMAZON", "NETFLIX", "HULU", "ACORN", "CBSAA", "CRACKLE", "SONY CRACKLE",
  "DC", "FACEBOOK", "PSN", "SHUDDER", "YOUTUBE"
)
futondf3$type <- ifelse(
  futondf3$network2 %in% broadcast_networks,
  "broadcast",
  ifelse(futondf3$network2 %in% streaming_networks, "streaming", "cable")
)

# Keep premieres from 2000 through 2017. filter() also drops rows whose start
# date could not be parsed (NA).
futondf4 <- futondf3 %>%
  filter(
    startdate >= as.Date("2000-01-01"),
    startdate <= as.Date("2017-12-31")
  )
head(futondf4, n = 50)
## series2 network2
## 1 18 WHEELS OF JUSTICE TNN
## 2 CITY OF ANGELS CBS
## 3 CLEOPATRA 2525 SYNDICATION
## 4 OTHERS, THE NBC
## 5 COVER ME: BASED ON THE TRUE LIFE OF AN FBI FAMILY USA
## 6 SECRET AGENT MAN UPN
## 7 BEAT, THE UPN
## 8 WONDERLAND ABC
## 9 DC WB
## 10 FALCONE CBS
## 11 INVISIBLE MAN, THE SCI FI
## 12 SECRET ADVENTURES OF JULES VERNE, THE SCI FI
## 13 RESURRECTION BLVD. SHOWTIME
## 14 SOUL FOOD SHOWTIME
## 15 YOUNG AMERICANS WB
## 16 OPPOSITE SEX FOX
## 17 STRONG MEDICINE LIFETIME
## 18 MYSTERIOUS WAYS PAX
## 19 HUNTRESS, THE USA
## 20 LIVE THROUGH THIS MTV
## 21 BULL TNT
## 22 WITCHBLADE TNT
## 23 THAT'S LIFE CBS
## 24 ANDROMEDA SCI FI
## 25 DEADLINE NBC
## 26 QUEEN OF SWORDS SYNDICATION
## 27 SHEENA SYNDICATION
## 28 DARK ANGEL FOX
## 29 TITANS NBC
## 30 GILMORE GIRLS NETFLIX
## 31 CSI: CRIME SCENE INVESTIGATION CBS
## 32 FREAKYLINKS FOX
## 33 FUGITIVE, THE CBS
## 34 DISTRICT, THE CBS
## 35 IMMORTAL, THE SYNDICATION
## 36 ED NBC
## 37 GIDEON'S CROSSING ABC
## 38 BOSTON PUBLIC FOX
## 39 FREEDOM UPN
## 40 LEVEL NINE UPN
## 41 STREET, THE FOX
## 42 QUEER AS FOLK SHOWTIME
## 43 BLACK SCORPION SCI FI
## 44 DIVISION, THE LIFETIME
## 45 100 CENTRE STREET A&E
## 46 KATE BRASHER CBS
## 47 BIG APPLE CBS
## 48 LONE GUNMEN, THE FOX
## 49 DOC PAX
## 50 CHRIS ISAAK SHOW, THE SHOWTIME
## startdate enddate year type
## 1 2000-01-12 2001-06-06 2000 cable
## 2 2000-01-16 2000-12-21 2000 broadcast
## 3 2000-01-17 2001-03-05 2000 broadcast
## 4 2000-02-05 2000-06-10 2000 broadcast
## 5 2000-03-05 2001-03-24 2000 cable
## 6 2000-03-07 2000-07-28 2000 broadcast
## 7 2000-03-21 2000-04-25 2000 broadcast
## 8 2000-03-30 2000-04-06 2000 broadcast
## 9 2000-04-02 2000-04-23 2000 broadcast
## 10 2000-04-04 2000-04-12 2000 broadcast
## 11 2000-06-09 2002-02-01 2000 cable
## 12 2000-06-18 2000-12-16 2000 cable
## 13 2000-06-26 2002-09-18 2000 cable
## 14 2000-06-28 2004-05-26 2000 cable
## 15 2000-07-12 2000-08-30 2000 broadcast
## 16 2000-07-17 2000-08-21 2000 broadcast
## 17 2000-07-23 2006-02-05 2000 cable
## 18 2000-07-24 2002-05-14 2000 broadcast
## 19 2000-07-26 2001-09-09 2000 cable
## 20 2000-08-09 2000-11-16 2000 cable
## 21 2000-08-15 2000-10-24 2000 cable
## 22 2000-08-27 2002-08-26 2000 cable
## 23 2000-10-01 2002-01-26 2000 broadcast
## 24 2000-10-02 2005-05-13 2000 cable
## 25 2000-10-02 2000-10-30 2000 broadcast
## 26 2000-10-02 2001-06-02 2000 broadcast
## 27 2000-10-02 2002-02-23 2000 broadcast
## 28 2000-10-03 2002-05-03 2000 broadcast
## 29 2000-10-04 2000-12-18 2000 broadcast
## 30 2000-10-05 2016-11-25 2000 streaming
## 31 2000-10-06 2015-09-27 2000 broadcast
## 32 2000-10-06 2001-06-22 2000 broadcast
## 33 2000-10-06 2001-05-25 2000 broadcast
## 34 2000-10-07 2004-05-01 2000 broadcast
## 35 2000-10-07 2001-06-02 2000 broadcast
## 36 2000-10-08 2004-02-06 2000 broadcast
## 37 2000-10-10 2001-04-09 2000 broadcast
## 38 2000-10-23 2005-03-02 2000 broadcast
## 39 2000-10-27 2000-12-22 2000 broadcast
## 40 2000-10-27 2001-01-26 2000 broadcast
## 41 2000-11-01 2000-12-06 2000 broadcast
## 42 2000-12-03 2005-08-07 2000 cable
## 43 2001-01-05 2001-06-30 2001 cable
## 44 2001-01-07 2004-06-28 2001 cable
## 45 2001-01-15 2002-03-05 2001 cable
## 46 2001-02-24 2001-04-14 2001 broadcast
## 47 2001-03-01 2001-03-29 2001 broadcast
## 48 2001-03-04 2001-06-01 2001 broadcast
## 49 2001-03-11 2004-11-01 2001 broadcast
## 50 2001-03-12 2004-03-25 2001 cable
Step 3: Plotting the Data
# Stacked bar chart: one bar per premiere year, filled by network type.
ggplot(futondf4, aes(x = year, fill = type)) +
  geom_bar() +
  labs(
    x = "Year",
    y = "# of Drama Series Premieres",
    title = "Number of Drama Series Premieres by Year: Broadcast, Cable, and Streaming"
  ) +
  theme(
    plot.title = element_text(hjust = 0.5), # centre the title
    legend.title = element_blank()          # legend entries are self-explanatory
  )
