# Header ------------------------------------------------------------------
# File Name: problemset2
# File Purpose: R code for Problem Set 2
# Author: Nakadi Yash (University of Pennsylvania)
# Date Created: 2020/06/08
# Hours Worked: 4-5
# Notes:
# Version Log:
# Version Date Notes
# 1.0 2020/06/08 Created
# Install Packages & WD ---------------------------------------------------
suppressMessages(library("tidyverse"))
suppressMessages(library("dplyr"))
suppressMessages(library("nycflights13"))
#set wd
setwd("C:/Users/rnaka/Desktop/Penn Freshman Year")
# Exercise 10.2: Working with Data Frames ---------------------------------
# We will be generating random numbers in this exercise. For reproducibility, we need to set a "seed" value that will be used by the computer to generate random numbers (if your instinct is that this means random number generation isn't truly "random" - you're right!)
set.seed(19104) # The zip code of the University of Pennsylvania
# Create a vector of 100 employees ("Employee 1", "Employee 2", ... "Employee 100")
# Hint: Start by creating two vectors, one for "Employee" and one for the numbers 1 to 100. Then combine them using paste().
employ = c("Employee")
numvector = c(1:100)
employeevector = paste(employ, numvector, sep = " ")
# Create a vector of 100 random salaries for the year 2018
# Use the `sample()` function to pick random numbers between 40000 and 50000.
# What argument should you include with sample() to say that two employees can
# have the same salary?
salaryvector = c(sample(40000:50000, size = 100, replace = TRUE))
# Create a vector of 100 annual salary adjustments between -5000 and 10000.
# (A negative number represents a salary decrease due to corporate greed)
# Again use the `sample()` function to pick 100 random numbers in that range.
salaryadjustmentvector = c(sample(-5000:10000, size = 100, replace = TRUE))
# Create a data frame `salaries` by combining the vectors you just made.
# Use data.frame() and include the argument `stringsAsFactors = FALSE`
salaries = data.frame(employeevector, salaryvector, salaryadjustmentvector, stringsAsFactors = FALSE)
head(salaries, 10)
## employeevector salaryvector salaryadjustmentvector
## 1 Employee 1 46299 9564
## 2 Employee 2 41300 4138
## 3 Employee 3 45109 84
## 4 Employee 4 49846 -914
## 5 Employee 5 45042 -2488
## 6 Employee 6 41299 6550
## 7 Employee 7 48672 6038
## 8 Employee 8 46934 6010
## 9 Employee 9 45314 6250
## 10 Employee 10 43949 5259
# Add a column to the `salaries` data frame that represents each person's salary in 2019 (e.g., with the salary adjustment added in).
salaries$updatedsalary2019 = c(salaryvector + salaryadjustmentvector)
# Add a column to the `salaries` data frame that has a value of `TRUE` if the person got a raise (their salary went up)
salaries$promotion = c(salaryadjustmentvector > 0)
promotion = c(salaryadjustmentvector > 0)
### Retrieve values from your data frame to answer the following questions
### Note that you should get the value as specific as possible (e.g., a single cell rather than the whole row!)
# What was the 2019 salary of Employee 57?
salaries[57, "updatedsalary2019"]
## [1] 39803
# How many employees got a raise?
length(which(salaryadjustmentvector > 0))
## [1] 71
# What was the dollar value of the highest raise?
salaries[which.max(salaryadjustmentvector), "salaryadjustmentvector"]
## [1] 9736
# What was the "name" of the employee who received the highest raise?
salaries[which.max(salaryadjustmentvector), "employeevector"]
## [1] "Employee 89"
# What was the largest decrease in salaries between the two years?
# Hint: Use abs() to get the absolute value
abs(salaries[which.min(salaryadjustmentvector), "salaryadjustmentvector"])
## [1] 4791
# What was the name of the employee who recieved largest decrease in salary?
salaries[which.min(salaryadjustmentvector), "employeevector"]
## [1] "Employee 45"
# What was the average salary change, rounded to the nearest dollar?
# Hint: An average is the same thing as a mean.
round(mean(salaryadjustmentvector))
## [1] 3119
# For people who did not get a raise, how much money did they lose on average, rounded to the nearest dollar?
noraise = salaryadjustmentvector[salaryadjustmentvector < 0]
round(mean(noraise))
## [1] -2871
# Write a .csv file of your salary data to your working directory
write.csv(salaries, "C:/Users/rnaka/Desktop/Penn Freshman Year/problemset2part1.csv")
# Exercise 11.4: Practicing with dplyr ------------------------------------
# Install the `nycflights13` package. Load the package.
# You'll also need to load `dplyr`
install.packages("nycflights13")
install.packages("dplyr")
install.packages("tidyverse")
library(nycflights13, dplyr)
# The data frame nycflights13::flights should now be accessible to you. Load it.
flightsframe = nycflights13::flights
# Use functions to inspect it: how many rows and columns does it have?
dim(flightsframe)
## [1] 336776 19
# What are the names of the columns?
names(flightsframe)
## [1] "year" "month" "day" "dep_time"
## [5] "sched_dep_time" "dep_delay" "arr_time" "sched_arr_time"
## [9] "arr_delay" "carrier" "flight" "tailnum"
## [13] "origin" "dest" "air_time" "distance"
## [17] "hour" "minute" "time_hour"
# What is the source of the data (if you wanted to cite the data)?
# This part is commented out, as per Piazza instructions
# ?flights
# Use `dplyr` to give the data frame a new column called "gain_in_air" that is the amount of time gained or lost while flying
# Hint: Use the dep_delay and arr_delay columns
flightsframe <- mutate(flightsframe, gain_in_air = flightsframe$dep_delay - flightsframe$arr_delay)
# Use `dplyr` to sort your data frame in descending order by the column you just created.
flightsframe <- arrange(flightsframe, dplyr::desc(flightsframe$gain_in_air))
# For practice, repeat the last 2 steps in a single statement using the pipe operator. You will need to remove the flights data frame from your environment and reload it.
remove(flightsframe)
data(flights)
flights <- flights %>% mutate(gain_in_air = dep_delay- arr_delay) %>%
arrange(dplyr::desc(gain_in_air))
# On average, did flights gain or lose time?
# Note: if you are getting "NA" as a result from your averaging command, you may want to check the arguments for that function in the help file.
mean(flights$gain_in_air, na.rm = T) # Gained around 5.7 minutes of time
## [1] 5.659779
# Create a data.frame of flights headed to SeaTac ('SEA'), only including the origin, destination, total air time, and the "gain_in_air" columns using dplyr and the inside-out method
seattle = select(filter(flights, flights$dest == "SEA"), origin, dest, air_time, gain_in_air)
# On average, did flights to SeaTac gain or loose time?
mean(seattle$gain_in_air, na.rm = T) # Gained around 11.7 minutes of time
## [1] 11.6991
# Consider flights from JFK to SEA. What was the average, min, and max air time of those flights? Use pipes to answer this question in one statement.
# Hint: Use group_by() and summarize()
seattle %>% group_by(origin) %>%
summarize(avgtime = mean(air_time,na.rm=T),
mintime = min(air_time,na.rm=T),
maxtime = max(air_time,na.rm=T)) %>%
filter(origin=="JFK")
## `summarise()` ungrouping output (override with `.groups` argument)
## # A tibble: 1 x 4
## origin avgtime mintime maxtime
## <chr> <dbl> <dbl> <dbl>
## 1 JFK 329. 275 389
# Exercise 11.8: Exploring data sets --------------------------------------
# Read in the data (from https://fivethirtyeight.datasettes.com/fivethirtyeight/pulitzer%2Fpulitzer-circulation-data.csv). Use read.csv() with the argument `stringsAsFactors = FALSE`
dta1 <- read.csv("https://fivethirtyeight.datasettes.com/fivethirtyeight/pulitzer%2Fpulitzer-circulation-data.csv", stringsAsFactors = FALSE)
# Examine the raw data using some of the commands we have learned.
dim(dta1)
## [1] 50 8
head(dta1)
## rowid Newspaper Daily.Circulation..2004 Daily.Circulation..2013
## 1 1 USA Today 2,192,098 1,674,306
## 2 2 Wall Street Journal 2,101,017 2,378,827
## 3 3 New York Times 1,119,027 1,865,318
## 4 4 Los Angeles Times 983,727 653,868
## 5 5 Washington Post 760,034 474,767
## 6 6 New York Daily News 712,671 516,165
## Change.in.Daily.Circulation..2004.2013
## 1 -24%
## 2 +13%
## 3 +67%
## 4 -34%
## 5 -38%
## 6 -28%
## Pulitzer.Prize.Winners.and.Finalists..1990.2003
## 1 1
## 2 30
## 3 55
## 4 44
## 5 52
## 6 4
## Pulitzer.Prize.Winners.and.Finalists..2004.2014
## 1 1
## 2 20
## 3 62
## 4 41
## 5 48
## 6 2
## Pulitzer.Prize.Winners.and.Finalists..1990.2014
## 1 2
## 2 50
## 3 117
## 4 85
## 5 100
## 6 6
colnames(dta1)
## [1] "rowid"
## [2] "Newspaper"
## [3] "Daily.Circulation..2004"
## [4] "Daily.Circulation..2013"
## [5] "Change.in.Daily.Circulation..2004.2013"
## [6] "Pulitzer.Prize.Winners.and.Finalists..1990.2003"
## [7] "Pulitzer.Prize.Winners.and.Finalists..2004.2014"
## [8] "Pulitzer.Prize.Winners.and.Finalists..1990.2014"
dplyr::glimpse(dta1)
## Rows: 50
## Columns: 8
## $ rowid <int> 1, 2, 3, 4, 5, 6, 7...
## $ Newspaper <chr> "USA Today", "Wall ...
## $ Daily.Circulation..2004 <chr> "2,192,098", "2,101...
## $ Daily.Circulation..2013 <chr> "1,674,306", "2,378...
## $ Change.in.Daily.Circulation..2004.2013 <chr> "-24%", "+13%", "+6...
## $ Pulitzer.Prize.Winners.and.Finalists..1990.2003 <int> 1, 30, 55, 44, 52, ...
## $ Pulitzer.Prize.Winners.and.Finalists..2004.2014 <int> 1, 20, 62, 41, 48, ...
## $ Pulitzer.Prize.Winners.and.Finalists..1990.2014 <int> 2, 50, 117, 85, 100...
# Rename the columns as appropriate.
colnames(dta1)[1]<- "RowID"
colnames(dta1)[3] <- "DailyCirculation2004"
colnames(dta1)[4]<- "DailyCirculation2013"
colnames(dta1)[5]<- "ChangeinDailyCirculation2004to2013"
colnames(dta1)[6]<- "PulitzerPrizeWinnersandFinalists,1990to2003"
colnames(dta1)[7]<- "PulitzerPrizeWinnersandFinalists,2004to2014"
colnames(dta1)[8] <- "PulitzerPrizeWinnersandFinalists,1990to2014"
# Use the gsub() command to remove unnecessary punctuations from the daily circulation and change in daily circulation columns. If you don't know how this function works, check the help file.
# Hint: Use pattern = "," and replacement = "" as arguments in sub()
dta1$DailyCirculation2004 <- gsub(pattern = ",",replacement = "",x = dta1$DailyCirculation2004)
dta1$DailyCirculation2013 <- gsub(pattern = ",",replacement = "",x = dta1$DailyCirculation2013)
dta1$ChangeinDailyCirculation2004to2013 <- gsub(pattern = "%",replacement = "",x = dta1$ChangeinDailyCirculation2004to2013)
# Create numeric versions of the daily circulation and change in daily circulation columns
dta1$Numeric2004 = as.numeric(dta1$DailyCirculation2004)
dta1$Numeric2013 = as.numeric(dta1$DailyCirculation2013)
dta1$NumericChange = as.numeric(dta1$ChangeinDailyCirculation2004to2013)
# Check to make sure that the values in the columns you just created are the same as the values in the original columns
dta1$equality2004 = dta1$DailyCirculation2004 == dta1$Numeric2004
dta1$equality2013 = dta1$DailyCirculation2013 == dta1$Numeric2013
dta1$equalitychange = dta1$ChangeinDailyCirculation2004to2013 == dta1$NumericChange
#When running equalitychange, you can see there are inequality issues for the daily changes in circulation
# If there are rows with values that are NOT the same in both columns, check those rows to see if there is an error you need to correct.
dta2 = filter(dta1, dta1$equalitychange == "FALSE")
dta2[ ,which(colnames(dta2)%in%c("ChangeinDailyCirculation2004to2013","NumericChange"))]
## ChangeinDailyCirculation2004to2013 NumericChange
## 1 +13 13
## 2 +67 67
## 3 +4 4
## 4 +4 4
## 5 +22 22
## 6 +15 15
#The issue is the plus sign from the original data, we can ignore it
# An important part about being a data scientist is asking questions.
# Write a question you may be interested in about this data set, and then use dplyr to figure out the answer!
# Question 1: Has the global trend of moving online (between 2004 and 2013)
# hurt local news publications more or national publications (top ten) more
# Arrange by circulation change (most negative to most positive)
dta3 = arrange(dta1, dta1$ChangeinDailyCirculation2004to2013)
# Look at top ten newspapers hit hardest by the shift
dta3[1:10, "Newspaper"] # All local newspapers
## [1] "Rocky Mountain News" "New Orleans Times-Picayune"
## [3] "Newark Star Ledger" "Cleveland Plain Dealer"
## [5] "Philadelphia Inquirer" "Milwaukee Journal Sentinel"
## [7] "Tampa Bay Times" "Seattle Times"
## [9] "Minneapolis Star Tribune" "Tampa Tribune"
# Conclusion: Local newspapers have been hit the hardest by the
# shift to the online world
# Question 2: Is there popularity bias when selecting Pulitzer Prize winners?
dta4 = arrange(dta1, desc(dta1$`PulitzerPrizeWinnersandFinalists,1990to2014`))
dta4[1:10, "Newspaper"] #Mostly national sources
## [1] "New York Times" "Washington Post" "Los Angeles Times"
## [4] "Wall Street Journal" "Boston Globe" "Chicago Tribune"
## [7] "Philadelphia Inquirer" "Miami Herald" "Tampa Bay Times"
## [10] "Newsday"
# Compare the top 8 national news sources to the rest of the publications:
nationalwinners = sum(dta1[1:8, "PulitzerPrizeWinnersandFinalists,1990to2014"])
localwinners = sum(dta1[9:50, "PulitzerPrizeWinnersandFinalists,1990to2014"])
nationalwinners > localwinners
## [1] TRUE
# Conclusion: Not only do national news sources receive more prizes and
# finalist awards, but the 8 most popular journals have more awards than
# the other 42 publications, which are mostly local news sources