Cleaning up the fortune-500 2018 dataset

Intialisation

Set working directory

setwd("C:\\Users\\Andy's Home PC\\Documents\\R_Projects\\Fortune-500")
getwd() #check that working has been set

## [1] "C:/Users/Andy's Home PC/Documents/R_Projects/Fortune-500"

Manually* download file from Kaggle, and unzip it. Then load data from local drive, and identify potential problems. (*problem with automating the download and unzipping it)

data_2018 <- read.csv("C:\\Users\\Andy's Home PC\\Documents\\R_Projects\\Fortune-500\\data\\2018\\fortune1000-final.csv", header=TRUE, na.strings=c(""))
head(data_2018, 3) #let's see the first 3 records

##   rank              title Previous.Rank Revenues...M. Revenue.Change
## 1    1            Walmart             1     $500,343           3.00%
## 2    2        Exxon Mobil             4     $244,363          17.40%
## 3    3 Berkshire Hathaway             2     $242,137           8.30%
##   Profits...M. Profit.Change Assets...M. Mkt.Value.as.of.3.29.18...M.
## 1   $9,862.00        -27.70%   $204,522                     $263,563 
## 2  $19,710.00        151.40%   $348,691                     $316,157 
## 3  $44,940.00         86.70%   $702,095                     $492,008 
##   Employees                 CEO
## 1 2,300,000 C. Douglas McMillon
## 2    71,200     Darren W. Woods
## 3   377,000   Warren E. Buffett
##                                        CEO.Title     Sector
## 1 President, Chief Executive Officer &  Director  Retailing
## 2            Chairman &  Chief Executive Officer     Energy
## 3 Chairman, President &  Chief Executive Officer Financials
##                                   Industry Years.on.Fortune.500.List
## 1                    General Merchandisers                        24
## 2                       Petroleum Refining                        24
## 3 Insurance: Property and Casualty (Stock)                        24
##          City State Latitude Longitude
## 1 Bentonville    AR 36.37285 -94.20882
## 2      Irving    TX 32.81402 -96.94889
## 3       Omaha    NE 41.25654 -95.93450

Load the essential variables to a smaller dataset, and change the names of the columns

data2018sub <- data_2018[,c("rank", "title", "Revenues...M.", "Revenue.Change", "Profits...M.", "Profit.Change", "Assets...M.", "Employees", "Sector", "City", "State")]

library(dplyr)

## 
## Attaching package: 'dplyr'

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

#target <- rename(source, newName=oldName, newName=oldName)
data2018sub <- rename(data2018sub, company=title, revenue=Revenues...M., revenueChange=Revenue.Change, profit=Profits...M., profitChange=Profit.Change, asset=Assets...M., staff=Employees, sector=Sector, city=City, state=State)
head(data2018sub, 3) #let's see the first 3 records to confirm that rename has been made

##   rank            company   revenue revenueChange      profit profitChange
## 1    1            Walmart $500,343          3.00%  $9,862.00       -27.70%
## 2    2        Exxon Mobil $244,363         17.40% $19,710.00       151.40%
## 3    3 Berkshire Hathaway $242,137          8.30% $44,940.00        86.70%
##       asset     staff     sector        city state
## 1 $204,522  2,300,000  Retailing Bentonville    AR
## 2 $348,691     71,200     Energy      Irving    TX
## 3 $702,095    377,000 Financials       Omaha    NE

Removing “$”, and “,” in “revenue”, “profit”, and “asset” columns; removing “%” in “revenueChange”" and “profitChange” columns (else it will cause problems when converting from factor to numeric)

data2018sub$revenue <- gsub("\\$", "", data2018sub$revenue)#need "\\" to remove special symbols
data2018sub$profit <- gsub("\\$", "", data2018sub$profit)
data2018sub$asset <- gsub("\\$", "", data2018sub$asset)
data2018sub$revenue <- gsub(",", "", data2018sub$revenue)
data2018sub$profit <- gsub(",", "", data2018sub$profit)
data2018sub$asset <- gsub(",", "", data2018sub$asset)
data2018sub$revenueChange <- gsub("%", "", data2018sub$revenueChange)
data2018sub$profitChange <- gsub("%", "", data2018sub$profitChange)
data2018sub$staff <- gsub(",", "", data2018sub$staff)
data2018sub$profit <- gsub("\\(", "-", data2018sub$profit) #as brackets can cause problems later on
data2018sub$profit<- gsub("\\)", "", data2018sub$profit)
str(data2018sub) #note that the columns above have changed from "factor" to "character"

## 'data.frame':    1000 obs. of  11 variables:
##  $ rank         : int  1 2 3 4 5 6 7 8 9 10 ...
##  $ company      : Factor w/ 1000 levels "3M","99 Cents Only Stores",..: 955 342 125 81 915 578 260 47 97 388 ...
##  $ revenue      : chr  "500343 " "244363 " "242137 " "229234 " ...
##  $ revenueChange: chr  "3.00" "17.40" "8.30" "6.30" ...
##  $ profit       : chr  "9862.00 " "19710.00 " "44940.00 " "48351.00 " ...
##  $ profitChange : chr  "-27.70" "151.40" "86.70" "5.80" ...
##  $ asset        : chr  "204522 " "348691 " "702095 " "375319 " ...
##  $ staff        : chr  "2300000" "71200" "377000" "123000" ...
##  $ sector       : Factor w/ 21 levels "Aerospace &  Defense",..: 17 5 7 18 10 21 10 17 19 16 ...
##  $ city         : Factor w/ 400 levels "Akron","Allentown",..: 23 163 271 84 231 334 396 332 85 97 ...
##  $ state        : Factor w/ 46 levels "AL","AR","AZ",..: 2 41 28 4 23 4 38 45 41 21 ...

Following variables should be changed from “Factor” to “numeric”:revenue, revenueChange, profit, profitChange, asset, staff

  #cannot work:
    # data2018sub[, c("revenue", "revenueChange", "profit", "profitChange", "asset", "staff")]<- as.numeric(as.character(data2018sub[, c("revenue", "revenueChange", "profit", "profitChange", "asset", "staff")]))

  #method 1 - using unlist (tested ok)
#data2018sub[, c("revenue", "revenueChange", "profit", "profitChange", "asset", "staff")]<- as.numeric(unlist(data2018sub[, c("revenue", "revenueChange", "profit", "profitChange", "asset", "staff")]))

  #method 2 - using magrittr and lapply (tested ok)
# library(magrittr)
# cols = c(3, 4, 5, 6, 7, 8)
# data2018sub[,cols] %<>% lapply(function(x) as.numeric(x)) 
    #note 1: these variables have earlier already change to character
    #note 2: "%<>%" means that pipeline operator ("%>%") is applied to pass the data from variable to lapply, and then assign ("<-") it back to the variable.

  #method 3 - write a loop function (tested ok)
for(i in c(3:8)) {
    data2018sub[,i] <- as.numeric(data2018sub[,i])
}

## Warning: NAs introduced by coercion

## Warning: NAs introduced by coercion

## Warning: NAs introduced by coercion

str(data2018sub)#confirm that they changes have been made

## 'data.frame':    1000 obs. of  11 variables:
##  $ rank         : int  1 2 3 4 5 6 7 8 9 10 ...
##  $ company      : Factor w/ 1000 levels "3M","99 Cents Only Stores",..: 955 342 125 81 915 578 260 47 97 388 ...
##  $ revenue      : num  500343 244363 242137 229234 201159 ...
##  $ revenueChange: num  3 17.4 8.3 6.3 8.8 3.1 4.1 30.8 -2 -5.5 ...
##  $ profit       : num  9862 19710 44940 48351 10558 ...
##  $ profitChange : num  -27.7 151.4 86.7 5.8 50.5 ...
##  $ asset        : num  204522 348691 702095 375319 139058 ...
##  $ staff        : num  2300000 71200 377000 123000 260000 64500 203000 566000 254000 180000 ...
##  $ sector       : Factor w/ 21 levels "Aerospace &  Defense",..: 17 5 7 18 10 21 10 17 19 16 ...
##  $ city         : Factor w/ 400 levels "Akron","Allentown",..: 23 163 271 84 231 334 396 332 85 97 ...
##  $ state        : Factor w/ 46 levels "AL","AR","AZ",..: 2 41 28 4 23 4 38 45 41 21 ...

Removing the “state” column

data2018sub$state <- NULL

Change “company” from “factor” to “character”

data2018sub$company <- as.character(data2018sub$company)
str(data2018sub) #confirm that changes have been made

## 'data.frame':    1000 obs. of  10 variables:
##  $ rank         : int  1 2 3 4 5 6 7 8 9 10 ...
##  $ company      : chr  "Walmart" "Exxon Mobil" "Berkshire Hathaway" "Apple" ...
##  $ revenue      : num  500343 244363 242137 229234 201159 ...
##  $ revenueChange: num  3 17.4 8.3 6.3 8.8 3.1 4.1 30.8 -2 -5.5 ...
##  $ profit       : num  9862 19710 44940 48351 10558 ...
##  $ profitChange : num  -27.7 151.4 86.7 5.8 50.5 ...
##  $ asset        : num  204522 348691 702095 375319 139058 ...
##  $ staff        : num  2300000 71200 377000 123000 260000 64500 203000 566000 254000 180000 ...
##  $ sector       : Factor w/ 21 levels "Aerospace &  Defense",..: 17 5 7 18 10 21 10 17 19 16 ...
##  $ city         : Factor w/ 400 levels "Akron","Allentown",..: 23 163 271 84 231 334 396 332 85 97 ...

There are 1000 records, let’s reduce the number of records by focusing on top 500 companies and check missing data again

data2018sub1 <- data2018sub[1:500,] #load only 500 records to a sub-list
length(data2018sub1[!complete.cases(data2018sub1),]) #check number of missing records

## [1] 10

Exploring the dataset for missing records

Let’s pull out all records with missing data in specific columns in “revenue”" and “revenue change” columns

data2018sub1[is.na(data2018sub1$revenue),] # returns 0 rows => no missing data in "revenue" columns

##  [1] rank          company       revenue       revenueChange profit       
##  [6] profitChange  asset         staff         sector        city         
## <0 rows> (or 0-length row.names)

data2018sub1[is.na(data2018sub1$revenueChange),] #2 rows: "Fortive" amd "Conduent"

##     rank  company revenue revenueChange profit profitChange asset staff
## 420  420  Fortive    6656            NA 1044.5           NA 10501 26000
## 466  466 Conduent    6022            NA  181.0           NA  7548 90000
##                sector         city
## 420       Industrials      Everett
## 466 Business Services Florham Park

let’s pull out the records with missing data in “profit” column

data2018sub1[is.na(data2018sub1$profit), "company"] #"Peabody Energy"

## [1] "Peabody Energy"

data2018sub1[data2018sub1$company=="Peabody Energy", "sector"] #"Energy"

## [1] Energy
## 21 Levels: Aerospace &  Defense Apparel Business Services ... Wholesalers

data2018sub1[data2018sub1$sector=="Energy", "company"]#59 companies

##  [1] "Exxon Mobil"                     "Chevron"                        
##  [3] "Phillips 66"                     "Valero Energy"                  
##  [5] "Marathon Petroleum"              "Energy Transfer Equity"         
##  [7] "Andeavor"                        "World Fuel Services"            
##  [9] "Exelon"                          "ConocoPhillips"                 
## [11] "Enterprise Products Partners"    "Plains GP Holdings"             
## [13] "Duke Energy"                     "Southern"                       
## [15] "PBF Energy"                      "Halliburton"                    
## [17] "NextEra Energy"                  "PG& E Corp."                    
## [19] "Freeport-McMoRan"                "American Electric Power"        
## [21] "HollyFrontier"                   "Devon Energy"                   
## [23] "AES"                             "Kinder Morgan"                  
## [25] "FirstEnergy"                     "Occidental Petroleum"           
## [27] "NGL Energy Partners"             "DTE Energy"                     
## [29] "Dominion Energy"                 "Edison International"           
## [31] "Oneok"                           "Consolidated Edison"            
## [33] "Anadarko Petroleum"              "Xcel Energy"                    
## [35] "NRG Energy"                      "EOG Resources"                  
## [37] "Sempra Energy"                   "Entergy"                        
## [39] "CenterPoint Energy"              "Chesapeake Energy"              
## [41] "Public Service Enterprise Group" "Targa Resources"                
## [43] "Calpine"                         "DCP Midstream"                  
## [45] "Williams"                        "Eversource Energy"              
## [47] "WEC Energy Group"                "PPL"                            
## [49] "Delek US Holdings"               "Newmont Mining"                 
## [51] "National Oilwell Varco"          "CMS Energy"                     
## [53] "Apache"                          "Ameren"                         
## [55] "UGI"                             "Cheniere Energy"                
## [57] "Peabody Energy"                  "Pioneer Natural Resources"      
## [59] "Vistra Energy"

median(data2018sub1[data2018sub1$sector=="Energy", "profit"], na.rm=TRUE) # median profit of company in  energy sector is $870m.

## [1] 870

data2018sub1[data2018sub1$company=="Peabody Energy", "profit"] <- median(data2018sub1[data2018sub1$sector=="Energy", "profit"], na.rm=TRUE)#using median value of profit in energy sector to replace the missing profit in "Peabody Energy"

data2018sub1[data2018sub1$company=="Peabody Energy", "profit"] #confirm that the replacement is successful

## [1] 870

let’s pull out the records with missing data in “asset” , “staff” columns

data2018sub1[is.na(data2018sub1$asset), "company"] # 0 => no missing records in "asset" coulmn

## character(0)

data2018sub1[is.na(data2018sub1$staff), "company"] #0 => no missing records in "staff" column

## character(0)

Actually, we don’t really need the revenueChange and profitChange columns, so let’s remove them

data2018sub1$revenueChange <- NULL
data2018sub1$profitChange <- NULL
str(data2018sub1) #confirm that these columns have been removed

## 'data.frame':    500 obs. of  8 variables:
##  $ rank   : int  1 2 3 4 5 6 7 8 9 10 ...
##  $ company: chr  "Walmart" "Exxon Mobil" "Berkshire Hathaway" "Apple" ...
##  $ revenue: num  500343 244363 242137 229234 201159 ...
##  $ profit : num  9862 19710 44940 48351 10558 ...
##  $ asset  : num  204522 348691 702095 375319 139058 ...
##  $ staff  : num  2300000 71200 377000 123000 260000 64500 203000 566000 254000 180000 ...
##  $ sector : Factor w/ 21 levels "Aerospace &  Defense",..: 17 5 7 18 10 21 10 17 19 16 ...
##  $ city   : Factor w/ 400 levels "Akron","Allentown",..: 23 163 271 84 231 334 396 332 85 97 ...

Check for any other missing data

data2018sub1[!complete.cases(data2018sub1),]

##     rank         company revenue profit asset staff         sector city
## 402  402 JetBlue Airways    7015   1147  9781 17424 Transportation <NA>

The city of “JetBlue Airways” is missing. A search in the web confirms that the city should be “New York”. Let’s insert this into the record.

data2018sub1[data2018sub1$company=="JetBlue Airways", "city"] <- "New York"
data2018sub1[data2018sub1$company=="JetBlue Airways", "city"]

## [1] New York
## 400 Levels: Akron Allentown Ames Andover Ankeny ... Zeeland

Check for any other missing data (if any)

data2018sub1[is.na(data2018sub1),] #we can also use data2018sub1[!complete.cases(data2018sub1),]

## [1] rank    company revenue profit  asset   staff   sector  city   
## <0 rows> (or 0-length row.names)

Confirmed that no further missing records.

Cleaning up the fortune-500 2018 dataset

Andy Tai

9 August 2018

Intialisation

Exploring the dataset for missing records