library(tidyr)
library(dplyr)
##
## Attaching package: 'dplyr'
##
## The following objects are masked from 'package:stats':
##
## filter, lag
##
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(stringr)
#Location of the master awards CSV used by the project
url <- "https://raw.githubusercontent.com/mkds/IS607_Project3/gh-pages/Data/Data_All_MASTER_edit.csv"
#The first line of the file is the header; strings stay character instead of becoming factors
data <- read.csv(url, header = TRUE, stringsAsFactors = FALSE)
#View() opens a data viewer, so it is only called from an interactive session;
#a non-interactive run (Rscript, headless server) skips it instead of failing
if (interactive()) {
  View(data)
}
#Drop the first 2269 rows by position
#NOTE(review): hard-coded offset - presumably the ceremonies before 1950 (the first
#rows printed further down are from 1950); confirm if the CSV ever changes
data <- data[-seq_len(2269), ]
#List the distinct category labels that remain
unique(data$Category)
## [1] "ACTOR"
## [2] "ACTOR IN A SUPPORTING ROLE"
## [3] "ACTRESS"
## [4] "ACTRESS IN A SUPPORTING ROLE"
## [5] "ART DIRECTION (Black-and-White)"
## [6] "ART DIRECTION (Color)"
## [7] "CINEMATOGRAPHY (Black-and-White)"
## [8] "CINEMATOGRAPHY (Color)"
## [9] "COSTUME DESIGN (Black-and-White)"
## [10] "COSTUME DESIGN (Color)"
## [11] "DIRECTING"
## [12] "DOCUMENTARY (Feature)"
## [13] "DOCUMENTARY (Short Subject)"
## [14] "FILM EDITING"
## [15] "MUSIC (Music Score of a Dramatic or Comedy Picture)"
## [16] "MUSIC (Scoring of a Musical Picture)"
## [17] "MUSIC (Song)"
## [18] "BEST MOTION PICTURE"
## [19] "SHORT SUBJECT (Cartoon)"
## [20] "SHORT SUBJECT (One-reel)"
## [21] "SHORT SUBJECT (Two-reel)"
## [22] "SOUND RECORDING"
## [23] "SPECIAL EFFECTS"
## [24] "WRITING (Motion Picture Story)"
## [25] "WRITING (Screenplay)"
## [26] "WRITING (Story and Screenplay)"
## [27] "HONORARY FOREIGN LANGUAGE FILM AWARD"
## [28] "HONORARY AWARD"
## [29] "IRVING G. THALBERG MEMORIAL AWARD"
## [30] "SCIENTIFIC OR TECHNICAL AWARD (Class II)"
## [31] "SCIENTIFIC OR TECHNICAL AWARD (Class III)"
## [32] "SCIENTIFIC OR TECHNICAL AWARD (Class I)"
## [33] "FOREIGN LANGUAGE FILM"
## [34] "WRITING (Screenplay--Adapted)"
## [35] "WRITING (Screenplay--Original)"
## [36] "JEAN HERSHOLT HUMANITARIAN AWARD"
## [37] "ART DIRECTION"
## [38] "CINEMATOGRAPHY"
## [39] "COSTUME DESIGN"
## [40] "MUSIC (Scoring)"
## [41] "SHORT SUBJECT (Live Action)"
## [42] "WRITING (Screenplay--based on material from another medium)"
## [43] "WRITING (Story and Screenplay--written directly for the screen)"
## [44] "SOUND"
## [45] "The Longest Day -- Jean Bourgoin; Walter Wottitz; (Henri Persin)"
## [46] "MUSIC (Music Score--substantially original)"
## [47] "MUSIC (Scoring of Music--adaptation or treatment)"
## [48] "BEST PICTURE"
## [49] "SOUND EFFECTS"
## [50] "SPECIAL VISUAL EFFECTS"
## [51] "MUSIC (Original Music Score)"
## [52] "MUSIC (Original Score--for a motion picture [not a musical])"
## [53] "MUSIC (Score of a Musical Picture--original or adaptation)"
## [54] "MUSIC (Song--Original for the Picture)"
## [55] "WRITING (Story and Screenplay--based on material not previously published or produced)"
## [56] "MUSIC (Original Score)"
## [57] "MUSIC (Original Song Score)"
## [58] "For All We Know from Lovers and Other Strangers -- Music by Fred Karlin; Lyrics by Robb Royer (aka Robb Wilson) and James Griffin (aka Arthur James)"
## [59] "WRITING (Story and Screenplay--based on factual material or material not previously published or produced)"
## [60] "MUSIC (Original Dramatic Score)"
## [61] "MUSIC (Scoring: Adaptation and Original Song Score)"
## [62] "SHORT SUBJECT (Animated)"
## [63] "SPECIAL ACHIEVEMENT AWARD (Visual Effects)"
## [64] "MUSIC (Scoring: Original Song Score and Adaptation -or- Scoring: Adaptation)"
## [65] "SHORT FILM (Animated)"
## [66] "SHORT FILM (Live Action)"
## [67] "WRITING (Original Screenplay)"
## [68] "WRITING (Screenplay Adapted from Other Material)"
## [69] "MUSIC (Original Song)"
## [70] "SPECIAL ACHIEVEMENT AWARD (Sound Effects)"
## [71] "ACTOR IN A LEADING ROLE"
## [72] "ACTRESS IN A LEADING ROLE"
## [73] "MUSIC (Original Song Score and Its Adaptation or Adaptation Score)"
## [74] "WRITING (Screenplay Written Directly for the Screen--based on factual material or on story material not previously published or produced)"
## [75] "VISUAL EFFECTS"
## [76] "SPECIAL ACHIEVEMENT AWARD"
## [77] "SPECIAL ACHIEVEMENT AWARD (Sound Effects Editing)"
## [78] "MEDAL OF COMMENDATION"
## [79] "MUSIC (Adaptation Score)"
## [80] "WRITING (Screenplay Based on Material from Another Medium)"
## [81] "WRITING (Screenplay Written Directly for the Screen)"
## [82] "SCIENTIFIC OR TECHNICAL AWARD (Academy Award of Merit)"
## [83] "SCIENTIFIC OR TECHNICAL AWARD (Scientific and Engineering Award)"
## [84] "SCIENTIFIC OR TECHNICAL AWARD (Technical Achievement Award)"
## [85] "MUSIC (Original Song Score and Its Adaptation -or- Adaptation Score)"
## [86] "SPECIAL ACHIEVEMENT AWARD (Sound Editing)"
## [87] "SHORT FILM (Dramatic Live Action)"
## [88] "MAKEUP"
## [89] "GORDON E. SAWYER AWARD"
## [90] "SOUND EFFECTS EDITING"
## [91] "MUSIC (Original Song Score or Adaptation Score)"
## [92] "AWARD OF COMMENDATION"
## [93] "JFK -- Robert Richardson"
## [94] "JFK -- Oliver Stone"
## [95] "JFK -- Joe Hutshing; Pietro Scalia"
## [96] "JFK -- John Williams"
## [97] "JFK -- A. Kitman Ho and Oliver Stone; Producers"
## [98] "JFK -- Michael Minkler; Gregg Landaker; Tod A. Maitland"
## [99] "WRITING (Screenplay Based on Material Previously Produced or Published)"
## [100] "JFK -- Oliver Stone; Zachary Sklar"
## [101] "SCIENTIFIC AND TECHNICAL AWARD (Academy Award of Merit)"
## [102] "SCIENTIFIC AND TECHNICAL AWARD (Scientific and Engineering Award)"
## [103] "SCIENTIFIC AND TECHNICAL AWARD (Technical Achievement Award)"
## [104] "MUSIC (Original Musical or Comedy Score)"
## [105] "JOHN A. BONNER MEDAL OF COMMENDATION"
## [106] "SOUND EDITING"
## [107] "ANIMATED FEATURE FILM"
## [108] "WRITING (Adapted Screenplay)"
## [109] "SOUND MIXING"
## [110] "WALL-E -- Ben Burtt and Matthew Wood"
## [111] "WALL-E -- Tom Myers; Michael Semanick and Ben Burtt"
## [112] "MAKEUP AND HAIRSTYLING"
## [113] "PRODUCTION DESIGN"
#The same award is labelled differently from year to year, so the variants are
#collapsed into one label per award. The rules run top to bottom, and the order
#matters for the SOUND labels: "SOUND EFFECTS EDITING" has to be rewritten
#before the shorter "SOUND EFFECTS".
#Sound editing is treated as covering all recording of audio, whereas sound
#mixing is deciding how to merge the recordings for the finished product.
data$Category <- data$Category %>%
  str_replace_all("(MUSIC).*", replacement = "MUSIC") %>%
  str_replace_all("(CINEMATOGRAPHY).*", replacement = "CINEMATOGRAPHY") %>%
  str_replace_all("(COSTUME DESIGN).*", replacement = "COSTUME DESIGN") %>%
  str_replace_all("(ART DIRECTION).*", replacement = "ART DIRECTION") %>%
  str_replace_all(".*(VISUAL EFFECTS)", replacement = "SPECIAL EFFECTS") %>%
  str_replace_all("(WRITING).*", replacement = "WRITING") %>%
  str_replace_all("(MAKEUP).*", replacement = "MAKEUP") %>%
  str_replace_all("(ACTOR IN A LEADING ROLE)", replacement = "ACTOR") %>%
  str_replace_all("(ACTRESS IN A LEADING ROLE)", replacement = "ACTRESS") %>%
  str_replace_all("(BEST MOTION PICTURE)", replacement = "BEST PICTURE") %>%
  str_replace_all("(SOUND EFFECTS EDITING)", replacement = "SOUND EDITING") %>%
  str_replace_all("(SOUND RECORDING)", replacement = "SOUND EDITING") %>%
  str_replace_all("(SOUND EFFECTS)", replacement = "SOUND EDITING") %>%
  #a bare "SOUND" label (nothing after it) becomes SOUND MIXING
  str_replace_all("(SOUND$)", replacement = "SOUND MIXING") %>%
  #one Category cell holds the text of a song nomination; relabel it as MUSIC
  str_replace_all("(For All We Know).*", replacement = "MUSIC")
#Shorten any Nominee text that starts the same way to the song-and-film title
data$Nominee <- data$Nominee %>%
  str_replace_all("(For All We Know).*", replacement = "For All We Know from Lovers and Other Strangers")
#Had some issues with the actors and actresses not pulling properly to the totals
#trim: remove every comma from x, then strip leading and trailing whitespace.
#x is coerced to character by gsub; the result is a character vector of the
#same length as x, and NA stays NA.
#Commas are removed first so that whitespace left at either end by a removed
#comma is stripped as well (a single combined pattern would leave it behind).
trim <- function(x) {
  no_commas <- gsub(",", "", x, fixed = TRUE)
  gsub("^[[:space:]]+|[[:space:]]+$", "", no_commas)
}
#Clean every column with trim. lapply always returns one element per column,
#so the rebuilt frame keeps its shape even for a single row (sapply would
#simplify that case to a plain vector). Columns come back as character, not
#factor, regardless of the R version.
data <- data.frame(lapply(data, trim), check.names = FALSE, stringsAsFactors = FALSE)
#Keep only the rows whose category is one of the fifteen that will be evaluated
datanew <- data %>%
  filter(Category %in% c(
    "ACTOR", "ACTOR IN A SUPPORTING ROLE",
    "ACTRESS", "ACTRESS IN A SUPPORTING ROLE",
    "ART DIRECTION", "CINEMATOGRAPHY",
    "COSTUME DESIGN", "DIRECTING",
    "FILM EDITING", "MUSIC",
    "BEST PICTURE", "SPECIAL EFFECTS",
    "WRITING", "MAKEUP", "SOUND EDITING"
  ))
#Show the first rows of the filtered data
head(datanew)
## Year Category Nominee
## 1 1950 (23rd) ACTOR The Magnificent Yankee
## 2 1950 (23rd) ACTOR Cyrano de Bergerac
## 3 1950 (23rd) ACTOR Sunset Blvd.
## 4 1950 (23rd) ACTOR Harvey
## 5 1950 (23rd) ACTOR Father of the Bride
## 6 1950 (23rd) ACTOR IN A SUPPORTING ROLE Broken Arrow
## Additional.Info Won.
## 1 Louis Calhern no
## 2 Jos� Ferrer yes
## 3 William Holden no
## 4 James Stewart no
## 5 Spencer Tracy no
## 6 Jeff Chandler no
#The list of distinct categories is much shorter after the clean-up
unique(datanew[["Category"]])
## [1] ACTOR ACTOR IN A SUPPORTING ROLE
## [3] ACTRESS ACTRESS IN A SUPPORTING ROLE
## [5] ART DIRECTION CINEMATOGRAPHY
## [7] COSTUME DESIGN DIRECTING
## [9] FILM EDITING MUSIC
## [11] BEST PICTURE SOUND EDITING
## [13] SPECIAL EFFECTS WRITING
## [15] MAKEUP
## 61 Levels: ACTOR ACTOR IN A SUPPORTING ROLE ... WRITING
#Save the filtered nominations without row names
write.csv(datanew, "data_categories.csv", row.names = FALSE)
#Best Picture is no longer wanted as an observation, so those rows are left out.
#The fifth column (the yes/no win flag) is renamed and turned into integer 1/0.
main <- datanew %>% filter(Category != "BEST PICTURE")
names(main)[5] <- "won category"
main[["won category"]] <- main[["won category"]] %>%
  str_replace_all("yes", "1") %>%
  str_replace_all("no", "0") %>%
  as.integer()
head(main)
## Year Category Nominee
## 1 1950 (23rd) ACTOR The Magnificent Yankee
## 2 1950 (23rd) ACTOR Cyrano de Bergerac
## 3 1950 (23rd) ACTOR Sunset Blvd.
## 4 1950 (23rd) ACTOR Harvey
## 5 1950 (23rd) ACTOR Father of the Bride
## 6 1950 (23rd) ACTOR IN A SUPPORTING ROLE Broken Arrow
## Additional.Info won category
## 1 Louis Calhern 0
## 2 Jos� Ferrer 1
## 3 William Holden 0
## 4 James Stewart 0
## 5 Spencer Tracy 0
## 6 Jeff Chandler 0
#create a separate df of the Best Picture rows so the two can be merged properly
bp_rows <- filter(datanew, Category == "BEST PICTURE")
#drop columns 1, 2 and 4, which leaves the film title and the yes/no win flag
bp <- bp_rows[, -c(1, 2, 4)]
colnames(bp)[2] <- "bp won"
#all of the movies here were nominated so we can give them a 1
bp$`bp nomination` <- 1L
head(bp)
## Nominee bp won bp nomination
## 1 All about Eve yes 1
## 2 Born Yesterday no 1
## 3 Father of the Bride no 1
## 4 King Solomon's Mines no 1
## 5 Sunset Blvd. no 1
## 6 An American in Paris yes 1
#again make the values integers to get a later frequency count
bp$`bp won` <- str_replace_all(bp$`bp won`, "yes", "1")
bp$`bp won` <- str_replace_all(bp$`bp won`, "no", "0")
bp$`bp won` <- as.integer(bp$`bp won`)
#merge the two to apply the best picture info to the category nominations.
#The key is the title plus the ceremony year. A title on its own is not a safe
#key: a film sharing its name with a Best Picture nominee of another year would
#inherit that nominee's result, and a title nominated for Best Picture in two
#different years would duplicate every matching row of main.
#NOTE(review): a film whose category nomination falls in a different ceremony
#year than its Best Picture nomination no longer matches - confirm this is wanted
bp_keyed <- bp
bp_keyed$Year <- bp_rows$Year
#one lookup row per title and year, so the left join cannot multiply rows
bp_keyed <- bp_keyed[!duplicated(bp_keyed[, c("Nominee", "Year")]), ]
combined <- merge(main, bp_keyed, by = c("Nominee", "Year"), all.x = TRUE)
#make sure the descriptive columns are plain character vectors
text_cols <- c("Nominee", "Year", "Category", "Additional.Info")
combined[text_cols] <- lapply(combined[text_cols], as.character)
#replace NAs with zeros: rows without a Best Picture match count as 0.
#Only the count columns are filled, so missing text is not turned into "0".
for (count_col in c("won category", "bp won", "bp nomination")) {
  combined[[count_col]][is.na(combined[[count_col]])] <- 0L
}
#all of these observations represent category nominations so we can assign a 1
combined$`category nomination` <- 1L
#we can see how many movies lost the bp nomination by subtracting these two:
combined <- combined %>%
  mutate(`bp lost` = `bp nomination` - `bp won`)
#now we are ready to get our frequency counts: one row per category holding
#the total of each count column
summary <- combined %>%
  group_by(Category) %>%
  summarize(
    `category nomination` = sum(`category nomination`),
    `won category` = sum(`won category`),
    `bp nomination` = sum(`bp nomination`),
    `bp won` = sum(`bp won`),
    `bp lost` = sum(`bp lost`)
  )
#the grouping column is written out in lower case
names(summary)[1] <- "category"
#View() opens a data viewer, so it is only called from an interactive session;
#a non-interactive run skips it instead of failing
if (interactive()) {
  View(summary)
}
write.csv(summary, file = "frequencies.csv", row.names = FALSE)