Set working directory
setwd("C:\\Users\\Andy's Home PC\\Documents\\R_Projects\\Fortune-500")
getwd() #check that working has been set
## [1] "C:/Users/Andy's Home PC/Documents/R_Projects/Fortune-500"
Manually* download file from Kaggle, and unzip it. Then load data from local drive, and identify potential problems. (*problem with automating the download and unzipping it)
data_2018 <- read.csv("C:\\Users\\Andy's Home PC\\Documents\\R_Projects\\Fortune-500\\data\\2018\\fortune1000-final.csv", header=TRUE, na.strings=c(""))
head(data_2018, 3) #let's see the first 3 records
## rank title Previous.Rank Revenues...M. Revenue.Change
## 1 1 Walmart 1 $500,343 3.00%
## 2 2 Exxon Mobil 4 $244,363 17.40%
## 3 3 Berkshire Hathaway 2 $242,137 8.30%
## Profits...M. Profit.Change Assets...M. Mkt.Value.as.of.3.29.18...M.
## 1 $9,862.00 -27.70% $204,522 $263,563
## 2 $19,710.00 151.40% $348,691 $316,157
## 3 $44,940.00 86.70% $702,095 $492,008
## Employees CEO
## 1 2,300,000 C. Douglas McMillon
## 2 71,200 Darren W. Woods
## 3 377,000 Warren E. Buffett
## CEO.Title Sector
## 1 President, Chief Executive Officer & Director Retailing
## 2 Chairman & Chief Executive Officer Energy
## 3 Chairman, President & Chief Executive Officer Financials
## Industry Years.on.Fortune.500.List
## 1 General Merchandisers 24
## 2 Petroleum Refining 24
## 3 Insurance: Property and Casualty (Stock) 24
## City State Latitude Longitude
## 1 Bentonville AR 36.37285 -94.20882
## 2 Irving TX 32.81402 -96.94889
## 3 Omaha NE 41.25654 -95.93450
Load the essential variables to a smaller dataset, and change the names of the columns
data2018sub <- data_2018[,c("rank", "title", "Revenues...M.", "Revenue.Change", "Profits...M.", "Profit.Change", "Assets...M.", "Employees", "Sector", "City", "State")]
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
#target <- rename(source, newName=oldName, newName=oldName)
data2018sub <- rename(data2018sub, company=title, revenue=Revenues...M., revenueChange=Revenue.Change, profit=Profits...M., profitChange=Profit.Change, asset=Assets...M., staff=Employees, sector=Sector, city=City, state=State)
head(data2018sub, 3) #let's see the first 3 records to confirm that rename has been made
## rank company revenue revenueChange profit profitChange
## 1 1 Walmart $500,343 3.00% $9,862.00 -27.70%
## 2 2 Exxon Mobil $244,363 17.40% $19,710.00 151.40%
## 3 3 Berkshire Hathaway $242,137 8.30% $44,940.00 86.70%
## asset staff sector city state
## 1 $204,522 2,300,000 Retailing Bentonville AR
## 2 $348,691 71,200 Energy Irving TX
## 3 $702,095 377,000 Financials Omaha NE
Removing “$”, and “,” in “revenue”, “profit”, and “asset” columns; removing “%” in “revenueChange”" and “profitChange” columns (else it will cause problems when converting from factor to numeric)
data2018sub$revenue <- gsub("\\$", "", data2018sub$revenue)#need "\\" to remove special symbols
data2018sub$profit <- gsub("\\$", "", data2018sub$profit)
data2018sub$asset <- gsub("\\$", "", data2018sub$asset)
data2018sub$revenue <- gsub(",", "", data2018sub$revenue)
data2018sub$profit <- gsub(",", "", data2018sub$profit)
data2018sub$asset <- gsub(",", "", data2018sub$asset)
data2018sub$revenueChange <- gsub("%", "", data2018sub$revenueChange)
data2018sub$profitChange <- gsub("%", "", data2018sub$profitChange)
data2018sub$staff <- gsub(",", "", data2018sub$staff)
data2018sub$profit <- gsub("\\(", "-", data2018sub$profit) #as brackets can cause problems later on
data2018sub$profit<- gsub("\\)", "", data2018sub$profit)
str(data2018sub) #note that the columns above have changed from "factor" to "character"
## 'data.frame': 1000 obs. of 11 variables:
## $ rank : int 1 2 3 4 5 6 7 8 9 10 ...
## $ company : Factor w/ 1000 levels "3M","99 Cents Only Stores",..: 955 342 125 81 915 578 260 47 97 388 ...
## $ revenue : chr "500343 " "244363 " "242137 " "229234 " ...
## $ revenueChange: chr "3.00" "17.40" "8.30" "6.30" ...
## $ profit : chr "9862.00 " "19710.00 " "44940.00 " "48351.00 " ...
## $ profitChange : chr "-27.70" "151.40" "86.70" "5.80" ...
## $ asset : chr "204522 " "348691 " "702095 " "375319 " ...
## $ staff : chr "2300000" "71200" "377000" "123000" ...
## $ sector : Factor w/ 21 levels "Aerospace & Defense",..: 17 5 7 18 10 21 10 17 19 16 ...
## $ city : Factor w/ 400 levels "Akron","Allentown",..: 23 163 271 84 231 334 396 332 85 97 ...
## $ state : Factor w/ 46 levels "AL","AR","AZ",..: 2 41 28 4 23 4 38 45 41 21 ...
Following variables should be changed from “Factor” to “numeric”:revenue, revenueChange, profit, profitChange, asset, staff
#cannot work:
# data2018sub[, c("revenue", "revenueChange", "profit", "profitChange", "asset", "staff")]<- as.numeric(as.character(data2018sub[, c("revenue", "revenueChange", "profit", "profitChange", "asset", "staff")]))
#method 1 - using unlist (tested ok)
#data2018sub[, c("revenue", "revenueChange", "profit", "profitChange", "asset", "staff")]<- as.numeric(unlist(data2018sub[, c("revenue", "revenueChange", "profit", "profitChange", "asset", "staff")]))
#method 2 - using magrittr and lapply (tested ok)
# library(magrittr)
# cols = c(3, 4, 5, 6, 7, 8)
# data2018sub[,cols] %<>% lapply(function(x) as.numeric(x))
#note 1: these variables have earlier already change to character
#note 2: "%<>%" means that pipeline operator ("%>%") is applied to pass the data from variable to lapply, and then assign ("<-") it back to the variable.
#method 3 - write a loop function (tested ok)
for(i in c(3:8)) {
data2018sub[,i] <- as.numeric(data2018sub[,i])
}
## Warning: NAs introduced by coercion
## Warning: NAs introduced by coercion
## Warning: NAs introduced by coercion
str(data2018sub)#confirm that they changes have been made
## 'data.frame': 1000 obs. of 11 variables:
## $ rank : int 1 2 3 4 5 6 7 8 9 10 ...
## $ company : Factor w/ 1000 levels "3M","99 Cents Only Stores",..: 955 342 125 81 915 578 260 47 97 388 ...
## $ revenue : num 500343 244363 242137 229234 201159 ...
## $ revenueChange: num 3 17.4 8.3 6.3 8.8 3.1 4.1 30.8 -2 -5.5 ...
## $ profit : num 9862 19710 44940 48351 10558 ...
## $ profitChange : num -27.7 151.4 86.7 5.8 50.5 ...
## $ asset : num 204522 348691 702095 375319 139058 ...
## $ staff : num 2300000 71200 377000 123000 260000 64500 203000 566000 254000 180000 ...
## $ sector : Factor w/ 21 levels "Aerospace & Defense",..: 17 5 7 18 10 21 10 17 19 16 ...
## $ city : Factor w/ 400 levels "Akron","Allentown",..: 23 163 271 84 231 334 396 332 85 97 ...
## $ state : Factor w/ 46 levels "AL","AR","AZ",..: 2 41 28 4 23 4 38 45 41 21 ...
Removing the “state” column
data2018sub$state <- NULL
Change “company” from “factor” to “character”
data2018sub$company <- as.character(data2018sub$company)
str(data2018sub) #confirm that changes have been made
## 'data.frame': 1000 obs. of 10 variables:
## $ rank : int 1 2 3 4 5 6 7 8 9 10 ...
## $ company : chr "Walmart" "Exxon Mobil" "Berkshire Hathaway" "Apple" ...
## $ revenue : num 500343 244363 242137 229234 201159 ...
## $ revenueChange: num 3 17.4 8.3 6.3 8.8 3.1 4.1 30.8 -2 -5.5 ...
## $ profit : num 9862 19710 44940 48351 10558 ...
## $ profitChange : num -27.7 151.4 86.7 5.8 50.5 ...
## $ asset : num 204522 348691 702095 375319 139058 ...
## $ staff : num 2300000 71200 377000 123000 260000 64500 203000 566000 254000 180000 ...
## $ sector : Factor w/ 21 levels "Aerospace & Defense",..: 17 5 7 18 10 21 10 17 19 16 ...
## $ city : Factor w/ 400 levels "Akron","Allentown",..: 23 163 271 84 231 334 396 332 85 97 ...
There are 1000 records, let’s reduce the number of records by focusing on top 500 companies and check missing data again
data2018sub1 <- data2018sub[1:500,] #load only 500 records to a sub-list
length(data2018sub1[!complete.cases(data2018sub1),]) #check number of missing records
## [1] 10
Let’s pull out all records with missing data in specific columns in “revenue”" and “revenue change” columns
data2018sub1[is.na(data2018sub1$revenue),] # returns 0 rows => no missing data in "revenue" columns
## [1] rank company revenue revenueChange profit
## [6] profitChange asset staff sector city
## <0 rows> (or 0-length row.names)
data2018sub1[is.na(data2018sub1$revenueChange),] #2 rows: "Fortive" amd "Conduent"
## rank company revenue revenueChange profit profitChange asset staff
## 420 420 Fortive 6656 NA 1044.5 NA 10501 26000
## 466 466 Conduent 6022 NA 181.0 NA 7548 90000
## sector city
## 420 Industrials Everett
## 466 Business Services Florham Park
let’s pull out the records with missing data in “profit” column
data2018sub1[is.na(data2018sub1$profit), "company"] #"Peabody Energy"
## [1] "Peabody Energy"
data2018sub1[data2018sub1$company=="Peabody Energy", "sector"] #"Energy"
## [1] Energy
## 21 Levels: Aerospace & Defense Apparel Business Services ... Wholesalers
data2018sub1[data2018sub1$sector=="Energy", "company"]#59 companies
## [1] "Exxon Mobil" "Chevron"
## [3] "Phillips 66" "Valero Energy"
## [5] "Marathon Petroleum" "Energy Transfer Equity"
## [7] "Andeavor" "World Fuel Services"
## [9] "Exelon" "ConocoPhillips"
## [11] "Enterprise Products Partners" "Plains GP Holdings"
## [13] "Duke Energy" "Southern"
## [15] "PBF Energy" "Halliburton"
## [17] "NextEra Energy" "PG& E Corp."
## [19] "Freeport-McMoRan" "American Electric Power"
## [21] "HollyFrontier" "Devon Energy"
## [23] "AES" "Kinder Morgan"
## [25] "FirstEnergy" "Occidental Petroleum"
## [27] "NGL Energy Partners" "DTE Energy"
## [29] "Dominion Energy" "Edison International"
## [31] "Oneok" "Consolidated Edison"
## [33] "Anadarko Petroleum" "Xcel Energy"
## [35] "NRG Energy" "EOG Resources"
## [37] "Sempra Energy" "Entergy"
## [39] "CenterPoint Energy" "Chesapeake Energy"
## [41] "Public Service Enterprise Group" "Targa Resources"
## [43] "Calpine" "DCP Midstream"
## [45] "Williams" "Eversource Energy"
## [47] "WEC Energy Group" "PPL"
## [49] "Delek US Holdings" "Newmont Mining"
## [51] "National Oilwell Varco" "CMS Energy"
## [53] "Apache" "Ameren"
## [55] "UGI" "Cheniere Energy"
## [57] "Peabody Energy" "Pioneer Natural Resources"
## [59] "Vistra Energy"
median(data2018sub1[data2018sub1$sector=="Energy", "profit"], na.rm=TRUE) # median profit of company in energy sector is $870m.
## [1] 870
data2018sub1[data2018sub1$company=="Peabody Energy", "profit"] <- median(data2018sub1[data2018sub1$sector=="Energy", "profit"], na.rm=TRUE)#using median value of profit in energy sector to replace the missing profit in "Peabody Energy"
data2018sub1[data2018sub1$company=="Peabody Energy", "profit"] #confirm that the replacement is successful
## [1] 870
let’s pull out the records with missing data in “asset” , “staff” columns
data2018sub1[is.na(data2018sub1$asset), "company"] # 0 => no missing records in "asset" coulmn
## character(0)
data2018sub1[is.na(data2018sub1$staff), "company"] #0 => no missing records in "staff" column
## character(0)
Actually, we don’t really need the revenueChange and profitChange columns, so let’s remove them
data2018sub1$revenueChange <- NULL
data2018sub1$profitChange <- NULL
str(data2018sub1) #confirm that these columns have been removed
## 'data.frame': 500 obs. of 8 variables:
## $ rank : int 1 2 3 4 5 6 7 8 9 10 ...
## $ company: chr "Walmart" "Exxon Mobil" "Berkshire Hathaway" "Apple" ...
## $ revenue: num 500343 244363 242137 229234 201159 ...
## $ profit : num 9862 19710 44940 48351 10558 ...
## $ asset : num 204522 348691 702095 375319 139058 ...
## $ staff : num 2300000 71200 377000 123000 260000 64500 203000 566000 254000 180000 ...
## $ sector : Factor w/ 21 levels "Aerospace & Defense",..: 17 5 7 18 10 21 10 17 19 16 ...
## $ city : Factor w/ 400 levels "Akron","Allentown",..: 23 163 271 84 231 334 396 332 85 97 ...
Check for any other missing data
data2018sub1[!complete.cases(data2018sub1),]
## rank company revenue profit asset staff sector city
## 402 402 JetBlue Airways 7015 1147 9781 17424 Transportation <NA>
The city of “JetBlue Airways” is missing. A search in the web confirms that the city should be “New York”. Let’s insert this into the record.
data2018sub1[data2018sub1$company=="JetBlue Airways", "city"] <- "New York"
data2018sub1[data2018sub1$company=="JetBlue Airways", "city"]
## [1] New York
## 400 Levels: Akron Allentown Ames Andover Ankeny ... Zeeland
Check for any other missing data (if any)
data2018sub1[is.na(data2018sub1),] #we can also use data2018sub1[!complete.cases(data2018sub1),]
## [1] rank company revenue profit asset staff sector city
## <0 rows> (or 0-length row.names)
Confirmed that no further missing records.