#Purpose of this exercise is to explore functions in R
#related to data preparation
#set working directory
setwd("C:/Users/Hai/Downloads")
#load data set -- Data Set contains made up financial data of
#made up companies -- replace blanks with NA
fin <- read.csv("P3-Future-500-The-dataset.csv", na.strings=c(""))
#show first 6 rows of data set
head(fin)
## ID Name Industry Inception Employees State City
## 1 1 Over-Hex Software 2006 25 TN Franklin
## 2 2 Unimattax IT Services 2009 36 PA Newtown Square
## 3 3 Greenfax Retail 2012 NA SC Greenville
## 4 4 Blacklane IT Services 2011 66 CA Orange
## 5 5 Yearflex Software 2013 45 WI Madison
## 6 6 Indigoplanet IT Services 2013 60 NJ Manalapan
## Revenue Expenses Profit Growth
## 1 $9,684,527 1,130,700 Dollars 8553827 19%
## 2 $14,016,543 804,035 Dollars 13212508 20%
## 3 $9,746,272 1,044,375 Dollars 8701897 16%
## 4 $15,359,369 4,631,808 Dollars 10727561 19%
## 5 $8,567,910 4,374,841 Dollars 4193069 19%
## 6 $12,805,452 4,626,275 Dollars 8179177 22%
#show structure of data set
str(fin)
## 'data.frame': 500 obs. of 11 variables:
## $ ID : int 1 2 3 4 5 6 7 8 9 10 ...
## $ Name : chr "Over-Hex" "Unimattax" "Greenfax" "Blacklane" ...
## $ Industry : chr "Software" "IT Services" "Retail" "IT Services" ...
## $ Inception: int 2006 2009 2012 2011 2013 2013 2009 2013 2009 2010 ...
## $ Employees: int 25 36 NA 66 45 60 116 73 55 25 ...
## $ State : chr "TN" "PA" "SC" "CA" ...
## $ City : chr "Franklin" "Newtown Square" "Greenville" "Orange" ...
## $ Revenue : chr "$9,684,527" "$14,016,543" "$9,746,272" "$15,359,369" ...
## $ Expenses : chr "1,130,700 Dollars" "804,035 Dollars" "1,044,375 Dollars" "4,631,808 Dollars" ...
## $ Profit : int 8553827 13212508 8701897 10727561 4193069 8179177 3259485 NA 5274553 11412916 ...
## $ Growth : chr "19%" "20%" "16%" "19%" ...
#convert the following variables to factors
fin$ID <- factor(fin$ID)
fin$Inception <- factor(fin$Inception)
#convert the character strings in expenses
#and revenue to a numeric value
fin$Expenses <- gsub(" Dollars","",fin$Expenses)
fin$Expenses <- gsub(",","",fin$Expenses)
fin$Revenue <- gsub("\\$","",fin$Revenue)
fin$Revenue <- gsub(",","",fin$Revenue)
fin$Growth <- gsub("%","",fin$Growth)
fin$Expenses <- as.numeric(fin$Expenses)
fin$Revenue <- as.numeric(fin$Revenue)
fin$Growth <- as.numeric(fin$Growth)
#show structure of data set after changes
str(fin)
## 'data.frame': 500 obs. of 11 variables:
## $ ID : Factor w/ 500 levels "1","2","3","4",..: 1 2 3 4 5 6 7 8 9 10 ...
## $ Name : chr "Over-Hex" "Unimattax" "Greenfax" "Blacklane" ...
## $ Industry : chr "Software" "IT Services" "Retail" "IT Services" ...
## $ Inception: Factor w/ 16 levels "1999","2000",..: 8 11 14 13 15 15 11 15 11 12 ...
## $ Employees: int 25 36 NA 66 45 60 116 73 55 25 ...
## $ State : chr "TN" "PA" "SC" "CA" ...
## $ City : chr "Franklin" "Newtown Square" "Greenville" "Orange" ...
## $ Revenue : num 9684527 14016543 9746272 15359369 8567910 ...
## $ Expenses : num 1130700 804035 1044375 4631808 4374841 ...
## $ Profit : int 8553827 13212508 8701897 10727561 4193069 8179177 3259485 NA 5274553 11412916 ...
## $ Growth : num 19 20 16 19 19 22 17 NA 30 20 ...
#identify missing values
fin[!complete.cases(fin),]
## ID Name Industry Inception Employees State
## 3 3 Greenfax Retail 2012 NA SC
## 8 8 Rednimdox Construction 2013 73 NY
## 11 11 Canecorporation Health 2012 6 <NA>
## 14 14 Techline <NA> 2006 65 CA
## 15 15 Cityace <NA> 2010 25 CO
## 17 17 Ganzlax IT Services 2011 75 NJ
## 22 22 Lathotline Health <NA> 103 VA
## 44 44 Ganzgreen Construction 2010 224 TN
## 84 84 Drilldrill Software 2010 30 <NA>
## 267 267 Circlechop Software 2010 14 <NA>
## 332 332 Westminster Financial Services 2010 NA MI
## 379 379 Stovepuck Retail 2013 73 <NA>
## City Revenue Expenses Profit Growth
## 3 Greenville 9746272 1044375 8701897 16
## 8 Woodside NA NA NA NA
## 11 New York 10597009 7591189 3005820 7
## 14 San Ramon 13898119 5470303 8427816 23
## 15 Louisville 9254614 6249498 3005116 6
## 17 Iselin 14001180 NA 11901180 18
## 22 McLean 9418303 7567233 1851070 2
## 44 Franklin NA NA NA 9
## 84 San Francisco 7800620 2785799 5014821 17
## 267 San Francisco 9067070 5929828 3137242 20
## 332 Troy 11861652 5245126 6616526 15
## 379 New York 13814975 5904502 7910473 10
#remove row data of industries with NA since it will be an
#important variable in our analysis and we don't have
#a method to replace the missing values
fin <- fin[!is.na(fin$Industry),]
#show remaining rows of data with NA values
fin[!complete.cases(fin),]
## ID Name Industry Inception Employees State
## 3 3 Greenfax Retail 2012 NA SC
## 8 8 Rednimdox Construction 2013 73 NY
## 11 11 Canecorporation Health 2012 6 <NA>
## 17 17 Ganzlax IT Services 2011 75 NJ
## 22 22 Lathotline Health <NA> 103 VA
## 44 44 Ganzgreen Construction 2010 224 TN
## 84 84 Drilldrill Software 2010 30 <NA>
## 267 267 Circlechop Software 2010 14 <NA>
## 332 332 Westminster Financial Services 2010 NA MI
## 379 379 Stovepuck Retail 2013 73 <NA>
## City Revenue Expenses Profit Growth
## 3 Greenville 9746272 1044375 8701897 16
## 8 Woodside NA NA NA NA
## 11 New York 10597009 7591189 3005820 7
## 17 Iselin 14001180 NA 11901180 18
## 22 McLean 9418303 7567233 1851070 2
## 44 Franklin NA NA NA 9
## 84 San Francisco 7800620 2785799 5014821 17
## 267 San Francisco 9067070 5929828 3137242 20
## 332 Troy 11861652 5245126 6616526 15
## 379 New York 13814975 5904502 7910473 10
#resetting the data frame index
#rownames(fin) <- NULL would also achieve the same result
rownames(fin) <- 1:nrow(fin)
#show last 6 values of data set -- note the row number vs ID
tail(fin)
## ID Name Industry Inception Employees State
## 493 495 Rawfishcomplete Financial Services 2012 124 CA
## 494 496 Buretteadmirable IT Services 2009 93 ME
## 495 497 Inventtremendous Construction 2009 24 MN
## 496 498 Overviewparrot Retail 2011 7125 TX
## 497 499 Belaguerra IT Services 2010 140 MI
## 498 500 Allpossible IT Services 2011 24 CA
## City Revenue Expenses Profit Growth
## 493 Los Angeles 10624949 2951178 7673771 22
## 494 Portland 15407450 2833136 12574314 25
## 495 Woodbury 9144857 4755995 4388862 11
## 496 Fort Worth 11134728 5152110 5982618 12
## 497 Troy 17387130 1387784 15999346 23
## 498 Los Angeles 11949706 689161 11260545 24
#show remaining rows of data with NA values
fin[!complete.cases(fin),]
## ID Name Industry Inception Employees State
## 3 3 Greenfax Retail 2012 NA SC
## 8 8 Rednimdox Construction 2013 73 NY
## 11 11 Canecorporation Health 2012 6 <NA>
## 15 17 Ganzlax IT Services 2011 75 NJ
## 20 22 Lathotline Health <NA> 103 VA
## 42 44 Ganzgreen Construction 2010 224 TN
## 82 84 Drilldrill Software 2010 30 <NA>
## 265 267 Circlechop Software 2010 14 <NA>
## 330 332 Westminster Financial Services 2010 NA MI
## 377 379 Stovepuck Retail 2013 73 <NA>
## City Revenue Expenses Profit Growth
## 3 Greenville 9746272 1044375 8701897 16
## 8 Woodside NA NA NA NA
## 11 New York 10597009 7591189 3005820 7
## 15 Iselin 14001180 NA 11901180 18
## 20 McLean 9418303 7567233 1851070 2
## 42 Franklin NA NA NA 9
## 82 San Francisco 7800620 2785799 5014821 17
## 265 San Francisco 9067070 5929828 3137242 20
## 330 Troy 11861652 5245126 6616526 15
## 377 New York 13814975 5904502 7910473 10
#from the subset of rows with missing values, let's look at
#states with NA
fin[is.na(fin$State),]
## ID Name Industry Inception Employees State City
## 11 11 Canecorporation Health 2012 6 <NA> New York
## 82 84 Drilldrill Software 2010 30 <NA> San Francisco
## 265 267 Circlechop Software 2010 14 <NA> San Francisco
## 377 379 Stovepuck Retail 2013 73 <NA> New York
## Revenue Expenses Profit Growth
## 11 10597009 7591189 3005820 7
## 82 7800620 2785799 5014821 17
## 265 9067070 5929828 3137242 20
## 377 13814975 5904502 7910473 10
#since we know the missing values of state, we will replace them manually
fin[is.na(fin$State) & fin$City == "New York","State"] <- "NY"
fin[is.na(fin$State) & fin$City == "San Francisco","State"] <- "CA"
#show table with newly replaced values
fin[c(11,82,265,377),]
## ID Name Industry Inception Employees State City
## 11 11 Canecorporation Health 2012 6 NY New York
## 82 84 Drilldrill Software 2010 30 CA San Francisco
## 265 267 Circlechop Software 2010 14 CA San Francisco
## 377 379 Stovepuck Retail 2013 73 NY New York
## Revenue Expenses Profit Growth
## 11 10597009 7591189 3005820 7
## 82 7800620 2785799 5014821 17
## 265 9067070 5929828 3137242 20
## 377 13814975 5904502 7910473 10
#show remaining rows of data with NA values
fin[!complete.cases(fin),]
## ID Name Industry Inception Employees State City
## 3 3 Greenfax Retail 2012 NA SC Greenville
## 8 8 Rednimdox Construction 2013 73 NY Woodside
## 15 17 Ganzlax IT Services 2011 75 NJ Iselin
## 20 22 Lathotline Health <NA> 103 VA McLean
## 42 44 Ganzgreen Construction 2010 224 TN Franklin
## 330 332 Westminster Financial Services 2010 NA MI Troy
## Revenue Expenses Profit Growth
## 3 9746272 1044375 8701897 16
## 8 NA NA NA NA
## 15 14001180 NA 11901180 18
## 20 9418303 7567233 1851070 2
## 42 NA NA NA 9
## 330 11861652 5245126 6616526 15
#replacing missing values : Median Imputation Method
#calc medians for missing variables by Industry
median_employees_retail <- median(fin[fin$Industry == "Retail","Employees"], na.rm=TRUE)
median_employees_fs <- median(fin[fin$Industry == "Financial Services","Employees"], na.rm=TRUE)
median_growth_construction <- median(fin[fin$Industry == "Construction","Growth"], na.rm=TRUE)
median_revenue_construction <- median(fin[fin$Industry == "Construction","Revenue"], na.rm=TRUE)
median_expenses_construction <- median(fin[fin$Industry == "Construction","Expenses"], na.rm=TRUE)
#replace NA values with median
fin[is.na(fin$Employees) & fin$Industry == "Retail","Employees"] <- median_employees_retail
fin[is.na(fin$Employees) & fin$Industry == "Financial Services","Employees"] <- median_employees_fs
fin[is.na(fin$Growth) & fin$Industry == "Construction","Growth"] <- median_growth_construction
fin[is.na(fin$Revenue) & fin$Industry == "Construction","Revenue"] <- median_revenue_construction
fin[is.na(fin$Expenses) & fin$Industry == "Construction","Expenses"] <- median_expenses_construction
#show table with newly replaced values
fin[c(3,330),]
## ID Name Industry Inception Employees State City
## 3 3 Greenfax Retail 2012 28 SC Greenville
## 330 332 Westminster Financial Services 2010 80 MI Troy
## Revenue Expenses Profit Growth
## 3 9746272 1044375 8701897 16
## 330 11861652 5245126 6616526 15
#show remaining rows of data with NA values
fin[!complete.cases(fin),]
## ID Name Industry Inception Employees State City Revenue
## 8 8 Rednimdox Construction 2013 73 NY Woodside 9055059
## 15 17 Ganzlax IT Services 2011 75 NJ Iselin 14001180
## 20 22 Lathotline Health <NA> 103 VA McLean 9418303
## 42 44 Ganzgreen Construction 2010 224 TN Franklin 9055059
## Expenses Profit Growth
## 8 4506976 NA 10
## 15 NA 11901180 18
## 20 7567233 1851070 2
## 42 4506976 NA 9
#replacing mising values: derived values
#revenue - expenses = profit
#replace NA values using formula
fin[is.na(fin$Profit),"Profit"] <- fin[is.na(fin$Profit),"Revenue"] - fin[is.na(fin$Profit),"Expenses"]
fin[is.na(fin$Expenses),"Expenses"] <- fin[is.na(fin$Expenses),"Revenue"] - fin[is.na(fin$Expenses),"Profit"]
#show table of newly replaced values
fin[c(8,15,42),]
## ID Name Industry Inception Employees State City Revenue
## 8 8 Rednimdox Construction 2013 73 NY Woodside 9055059
## 15 17 Ganzlax IT Services 2011 75 NJ Iselin 14001180
## 42 44 Ganzgreen Construction 2010 224 TN Franklin 9055059
## Expenses Profit Growth
## 8 4506976 4548083 10
## 15 2100000 11901180 18
## 42 4506976 4548083 9
#show remaining rows of data with NA values
#empty value in Inception will not affect our analysis so
#we will not replace this value
fin[!complete.cases(fin),]
## ID Name Industry Inception Employees State City Revenue Expenses
## 20 22 Lathotline Health <NA> 103 VA McLean 9418303 7567233
## Profit Growth
## 20 1851070 2
#Visualizations
library(ggplot2)
library(scales)
#Scatterplot classified by Industry, Profit showing revenue, expenses
ggplot(data=fin) +
geom_point(aes(x=Revenue, y=Expenses, color=Industry,
size=Profit)) +
ggtitle("Revenue vs Expense by Industry, Profit") +
theme(plot.title = element_text(hjust=0.5)) +
scale_y_continuous(labels = function(y) y/1000000) +
scale_x_continuous(labels = function(x) x/1000000) +
scale_size_continuous(labels = function(size) size/1000000) +
xlab("Revenue (M)") + ylab("Expenses (M)") + labs(color="Industry", size="Profit (M)")

#Scatterplot that also includes trends for the
ggplot(data=fin, aes(x=Revenue, y=Expenses, color=Industry)) +
geom_point() + geom_smooth(fill=NA, size=1.2) +
ggtitle("Revenue vs Expense by Industry") +
theme(plot.title = element_text(hjust=0.5)) +
coord_cartesian(ylim=c(0,10000000)) +
scale_y_continuous(labels = function(y) y/1000000) +
scale_x_continuous(labels = function(x) x/1000000) +
scale_size_continuous(labels = function(size) size/1000000) +
xlab("Revenue (M)") + ylab("Expenses (M)")
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'

#Boxplot
ggplot(data=fin, aes(x=Industry, y=Growth, color=Industry)) +
geom_jitter() + geom_boxplot(size=1, alpha=0.5, outlier.color=NA) +
ggtitle("Growth by Industry Boxplot") +
theme(axis.text.x=element_blank(),
plot.title = element_text(hjust=0.5))
