# Header ------------------------------------------------------------------

# File Name:          problemset2
# File Purpose:       R code for Problem Set 2
# Author:             Nakadi Yash (University of Pennsylvania)
# Date Created:     2020/06/08
# Hours Worked:     4-5
# Notes:            

# Version Log:
# Version           Date          Notes
# 1.0               2020/06/08    Created

# Install Packages & WD ---------------------------------------------------

suppressMessages(library("tidyverse"))
suppressMessages(library("dplyr"))
suppressMessages(library("nycflights13"))

#set wd
setwd("C:/Users/rnaka/Desktop/Penn Freshman Year")
# Exercise 10.2: Working with Data Frames ---------------------------------

# We will be generating random numbers in this exercise. For reproducibility, we need to set a "seed" value that will be used by the computer to generate random numbers (if your instinct is that this means random number generation isn't truly "random" - you're right!)
set.seed(19104) # The zip code of the University of Pennsylvania

# Create a vector of 100 employees ("Employee 1", "Employee 2", ... "Employee 100")
# Hint: Start by creating two vectors, one for "Employee" and one for the numbers 1 to 100. Then combine them using paste().
employ = c("Employee")
numvector = c(1:100)
employeevector = paste(employ, numvector, sep = " ")

# Create a vector of 100 random salaries for the year 2018
# Use the `sample()` function to pick random numbers between 40000 and 50000. 
# What argument should you include with sample() to say that two employees can 
# have the same salary?
salaryvector = c(sample(40000:50000, size = 100, replace = TRUE))

# Create a vector of 100 annual salary adjustments between -5000 and 10000.
# (A negative number represents a salary decrease due to corporate greed)
# Again use the `sample()` function to pick 100 random numbers in that range.
salaryadjustmentvector = c(sample(-5000:10000, size = 100, replace = TRUE))

# Create a data frame `salaries` by combining the vectors you just made. 
# Use data.frame() and include the argument `stringsAsFactors = FALSE`
salaries = data.frame(employeevector, salaryvector, salaryadjustmentvector, stringsAsFactors = FALSE)
head(salaries, 10)
##    employeevector salaryvector salaryadjustmentvector
## 1      Employee 1        46299                   9564
## 2      Employee 2        41300                   4138
## 3      Employee 3        45109                     84
## 4      Employee 4        49846                   -914
## 5      Employee 5        45042                  -2488
## 6      Employee 6        41299                   6550
## 7      Employee 7        48672                   6038
## 8      Employee 8        46934                   6010
## 9      Employee 9        45314                   6250
## 10    Employee 10        43949                   5259
# Add a column to the `salaries` data frame that represents each person's salary in 2019 (e.g., with the salary adjustment added in).
salaries$updatedsalary2019 = c(salaryvector + salaryadjustmentvector)

# Add a column to the `salaries` data frame that has a value of `TRUE` if the  person got a raise (their salary went up)
salaries$promotion = c(salaryadjustmentvector > 0)
promotion = c(salaryadjustmentvector > 0)

### Retrieve values from your data frame to answer the following questions
### Note that you should get the value as specific as possible (e.g., a single cell rather than the whole row!)

# What was the 2019 salary of Employee 57?
salaries[57, "updatedsalary2019"]
## [1] 39803
# How many employees got a raise?
length(which(salaryadjustmentvector > 0))
## [1] 71
# What was the dollar value of the highest raise?
salaries[which.max(salaryadjustmentvector), "salaryadjustmentvector"]
## [1] 9736
# What was the "name" of the employee who received the highest raise?
salaries[which.max(salaryadjustmentvector), "employeevector"]
## [1] "Employee 89"
# What was the largest decrease in salaries between the two years?
# Hint: Use abs() to get the absolute value
abs(salaries[which.min(salaryadjustmentvector), "salaryadjustmentvector"])
## [1] 4791
# What was the name of the employee who recieved largest decrease in salary?
salaries[which.min(salaryadjustmentvector), "employeevector"]
## [1] "Employee 45"
# What was the average salary change, rounded to the nearest dollar?
# Hint: An average is the same thing as a mean.
round(mean(salaryadjustmentvector))
## [1] 3119
# For people who did not get a raise, how much money did they lose on average, rounded to the nearest dollar?
noraise = salaryadjustmentvector[salaryadjustmentvector < 0]
round(mean(noraise))
## [1] -2871
# Write a .csv file of your salary data to your working directory
write.csv(salaries, "C:/Users/rnaka/Desktop/Penn Freshman Year/problemset2part1.csv")
# Exercise 11.4: Practicing with dplyr ------------------------------------

# Install the `nycflights13` package. Load the package.
# You'll also need to load `dplyr`
install.packages("nycflights13")
install.packages("dplyr")
install.packages("tidyverse")
library(nycflights13, dplyr)

# The data frame nycflights13::flights should now be accessible to you. Load it.

flightsframe = nycflights13::flights
# Use functions to inspect it: how many rows and columns does it have?
dim(flightsframe)
## [1] 336776     19
# What are the names of the columns?
names(flightsframe)
##  [1] "year"           "month"          "day"            "dep_time"      
##  [5] "sched_dep_time" "dep_delay"      "arr_time"       "sched_arr_time"
##  [9] "arr_delay"      "carrier"        "flight"         "tailnum"       
## [13] "origin"         "dest"           "air_time"       "distance"      
## [17] "hour"           "minute"         "time_hour"
# What is the source of the data (if you wanted to cite the data)?
# This part is commented out, as per Piazza instructions

# ?flights

# Use `dplyr` to give the data frame a new column called "gain_in_air" that is the amount of time gained or lost while flying
# Hint: Use the dep_delay and arr_delay columns
flightsframe <- mutate(flightsframe, gain_in_air = flightsframe$dep_delay - flightsframe$arr_delay)

# Use `dplyr` to sort your data frame in descending order by the column you just created.
flightsframe <- arrange(flightsframe, dplyr::desc(flightsframe$gain_in_air))

# For practice, repeat the last 2 steps in a single statement using the pipe operator. You will need to remove the flights data frame from your environment and reload it.
remove(flightsframe)
data(flights)
flights <- flights %>% mutate(gain_in_air = dep_delay- arr_delay) %>%   
           arrange(dplyr::desc(gain_in_air))

# On average, did flights gain or lose time?
# Note: if you are getting "NA" as a result from your averaging command, you may want to check the arguments for that function in the help file.
mean(flights$gain_in_air, na.rm = T) # Gained around 5.7 minutes of time
## [1] 5.659779
# Create a data.frame of flights headed to SeaTac ('SEA'), only including the origin, destination, total air time, and the "gain_in_air" columns using dplyr and the inside-out method
seattle = select(filter(flights, flights$dest == "SEA"), origin, dest, air_time, gain_in_air)

# On average, did flights to SeaTac gain or loose time?
mean(seattle$gain_in_air, na.rm = T) # Gained around 11.7 minutes of time
## [1] 11.6991
# Consider flights from JFK to SEA. What was the average, min, and max air time of those flights? Use pipes to answer this question in one statement.
# Hint: Use group_by() and summarize()
seattle %>% group_by(origin) %>% 
            summarize(avgtime = mean(air_time,na.rm=T),
                      mintime = min(air_time,na.rm=T),
                    maxtime = max(air_time,na.rm=T)) %>% 
  filter(origin=="JFK")
## `summarise()` ungrouping output (override with `.groups` argument)
## # A tibble: 1 x 4
##   origin avgtime mintime maxtime
##   <chr>    <dbl>   <dbl>   <dbl>
## 1 JFK       329.     275     389
# Exercise 11.8: Exploring data sets --------------------------------------

# Read in the data (from https://fivethirtyeight.datasettes.com/fivethirtyeight/pulitzer%2Fpulitzer-circulation-data.csv). Use read.csv() with the argument `stringsAsFactors = FALSE`
dta1 <- read.csv("https://fivethirtyeight.datasettes.com/fivethirtyeight/pulitzer%2Fpulitzer-circulation-data.csv", stringsAsFactors = FALSE)


# Examine the raw data using some of the commands we have learned.

dim(dta1)
## [1] 50  8
head(dta1)
##   rowid           Newspaper Daily.Circulation..2004 Daily.Circulation..2013
## 1     1           USA Today               2,192,098               1,674,306
## 2     2 Wall Street Journal               2,101,017               2,378,827
## 3     3      New York Times               1,119,027               1,865,318
## 4     4   Los Angeles Times                 983,727                 653,868
## 5     5     Washington Post                 760,034                 474,767
## 6     6 New York Daily News                 712,671                 516,165
##   Change.in.Daily.Circulation..2004.2013
## 1                                   -24%
## 2                                   +13%
## 3                                   +67%
## 4                                   -34%
## 5                                   -38%
## 6                                   -28%
##   Pulitzer.Prize.Winners.and.Finalists..1990.2003
## 1                                               1
## 2                                              30
## 3                                              55
## 4                                              44
## 5                                              52
## 6                                               4
##   Pulitzer.Prize.Winners.and.Finalists..2004.2014
## 1                                               1
## 2                                              20
## 3                                              62
## 4                                              41
## 5                                              48
## 6                                               2
##   Pulitzer.Prize.Winners.and.Finalists..1990.2014
## 1                                               2
## 2                                              50
## 3                                             117
## 4                                              85
## 5                                             100
## 6                                               6
colnames(dta1)
## [1] "rowid"                                          
## [2] "Newspaper"                                      
## [3] "Daily.Circulation..2004"                        
## [4] "Daily.Circulation..2013"                        
## [5] "Change.in.Daily.Circulation..2004.2013"         
## [6] "Pulitzer.Prize.Winners.and.Finalists..1990.2003"
## [7] "Pulitzer.Prize.Winners.and.Finalists..2004.2014"
## [8] "Pulitzer.Prize.Winners.and.Finalists..1990.2014"
dplyr::glimpse(dta1)
## Rows: 50
## Columns: 8
## $ rowid                                           <int> 1, 2, 3, 4, 5, 6, 7...
## $ Newspaper                                       <chr> "USA Today", "Wall ...
## $ Daily.Circulation..2004                         <chr> "2,192,098", "2,101...
## $ Daily.Circulation..2013                         <chr> "1,674,306", "2,378...
## $ Change.in.Daily.Circulation..2004.2013          <chr> "-24%", "+13%", "+6...
## $ Pulitzer.Prize.Winners.and.Finalists..1990.2003 <int> 1, 30, 55, 44, 52, ...
## $ Pulitzer.Prize.Winners.and.Finalists..2004.2014 <int> 1, 20, 62, 41, 48, ...
## $ Pulitzer.Prize.Winners.and.Finalists..1990.2014 <int> 2, 50, 117, 85, 100...
# Rename the columns as appropriate.
colnames(dta1)[1]<- "RowID"
colnames(dta1)[3] <- "DailyCirculation2004"
colnames(dta1)[4]<- "DailyCirculation2013"
colnames(dta1)[5]<- "ChangeinDailyCirculation2004to2013"
colnames(dta1)[6]<- "PulitzerPrizeWinnersandFinalists,1990to2003"
colnames(dta1)[7]<- "PulitzerPrizeWinnersandFinalists,2004to2014"
colnames(dta1)[8] <- "PulitzerPrizeWinnersandFinalists,1990to2014"

# Use the gsub() command to remove unnecessary punctuations from the daily circulation and change in daily circulation columns. If you don't know how this function works, check the help file.
# Hint: Use pattern = "," and replacement = "" as arguments in sub()
dta1$DailyCirculation2004 <- gsub(pattern = ",",replacement = "",x = dta1$DailyCirculation2004)
dta1$DailyCirculation2013 <- gsub(pattern = ",",replacement = "",x = dta1$DailyCirculation2013)
dta1$ChangeinDailyCirculation2004to2013 <- gsub(pattern = "%",replacement = "",x = dta1$ChangeinDailyCirculation2004to2013)


# Create numeric versions of the daily circulation and change in daily circulation columns
dta1$Numeric2004 = as.numeric(dta1$DailyCirculation2004)
dta1$Numeric2013 = as.numeric(dta1$DailyCirculation2013)
dta1$NumericChange = as.numeric(dta1$ChangeinDailyCirculation2004to2013)

# Check to make sure that the values in the columns you just created are the same as the values in the original columns
dta1$equality2004 = dta1$DailyCirculation2004 == dta1$Numeric2004 
dta1$equality2013 = dta1$DailyCirculation2013 == dta1$Numeric2013 
dta1$equalitychange = dta1$ChangeinDailyCirculation2004to2013 == dta1$NumericChange 
#When running equalitychange, you can see there are inequality issues for the daily changes in circulation

# If there are rows with values that are NOT the same in both columns, check those rows to see if there is an error you need to correct.
dta2 = filter(dta1, dta1$equalitychange == "FALSE")
dta2[ ,which(colnames(dta2)%in%c("ChangeinDailyCirculation2004to2013","NumericChange"))]
##   ChangeinDailyCirculation2004to2013 NumericChange
## 1                                +13            13
## 2                                +67            67
## 3                                 +4             4
## 4                                 +4             4
## 5                                +22            22
## 6                                +15            15
#The issue is the plus sign from the original data, we can ignore it

# An important part about being a data scientist is asking questions. 
# Write a question you may be interested in about this data set, and then use dplyr to figure out the answer!

# Question 1: Has the global trend of moving online (between 2004 and 2013)
# hurt local news publications more or national publications (top ten) more

# Arrange by circulation change (most negative to most positive)
dta3 = arrange(dta1, dta1$ChangeinDailyCirculation2004to2013)

# Look at top ten newspapers hit hardest by the shift
dta3[1:10, "Newspaper"] # All local newspapers
##  [1] "Rocky Mountain News"        "New Orleans Times-Picayune"
##  [3] "Newark Star Ledger"         "Cleveland Plain Dealer"    
##  [5] "Philadelphia Inquirer"      "Milwaukee Journal Sentinel"
##  [7] "Tampa Bay Times"            "Seattle Times"             
##  [9] "Minneapolis Star Tribune"   "Tampa Tribune"
# Conclusion: Local newspapers have been hit the hardest by the
# shift to the online world

# Question 2: Is there popularity bias when selecting Pulitzer Prize winners?

dta4 = arrange(dta1, desc(dta1$`PulitzerPrizeWinnersandFinalists,1990to2014`))
dta4[1:10, "Newspaper"] #Mostly national sources
##  [1] "New York Times"        "Washington Post"       "Los Angeles Times"    
##  [4] "Wall Street Journal"   "Boston Globe"          "Chicago Tribune"      
##  [7] "Philadelphia Inquirer" "Miami Herald"          "Tampa Bay Times"      
## [10] "Newsday"
# Compare the top 8 national news sources to the rest of the publications:
nationalwinners = sum(dta1[1:8, "PulitzerPrizeWinnersandFinalists,1990to2014"])
localwinners = sum(dta1[9:50, "PulitzerPrizeWinnersandFinalists,1990to2014"])
nationalwinners > localwinners
## [1] TRUE
# Conclusion: Not only do national news sources receive more prizes and
# finalist awards, but the 8 most popular journals have more awards than
# the other 42 publications, which are mostly local news sources