# Financial Review
# Data preprocessing
# Importing the dataset
fin <- read.csv('Future-500.csv', na.strings = c(""))
summary(fin)
## ID Name Industry
## Min. : 1.0 Abstractedchocolat: 1 IT Services :146
## 1st Qu.:125.8 Abusivebong : 1 Health : 86
## Median :250.5 Acclaimedcirl : 1 Software : 64
## Mean :250.5 Admitruppell : 1 Financial Services: 54
## 3rd Qu.:375.2 Admonishbadelynge : 1 Construction : 50
## Max. :500.0 Ahemparticular : 1 (Other) : 98
## (Other) :494 NA's : 2
## Inception Employees State City
## Min. :1999 Min. : 1.00 CA : 57 San Diego : 13
## 1st Qu.:2009 1st Qu.: 27.25 VA : 50 New York : 11
## Median :2011 Median : 56.00 TX : 47 Reston : 10
## Mean :2010 Mean : 148.61 FL : 34 Houston : 9
## 3rd Qu.:2012 3rd Qu.: 126.00 MD : 25 Austin : 8
## Max. :2014 Max. :7125.00 (Other):283 Minneapolis: 8
## NA's :1 NA's :2 NA's : 4 (Other) :441
## Revenue Expenses Profit
## $1,614,585 : 1 1,026,548 Dollars: 1 Min. : 12434
## $1,835,717 : 1 1,040,662 Dollars: 1 1st Qu.: 3272074
## $10,064,297: 1 1,044,375 Dollars: 1 Median : 6513366
## $10,067,223: 1 1,097,353 Dollars: 1 Mean : 6539474
## $10,072,452: 1 1,117,206 Dollars: 1 3rd Qu.: 9303951
## (Other) :493 (Other) :492 Max. :19624534
## NA's : 2 NA's : 3 NA's :2
## Growth
## 20% : 39
## 19% : 35
## 17% : 27
## 6% : 25
## 12% : 24
## (Other):349
## NA's : 1
str(fin)
## 'data.frame': 500 obs. of 11 variables:
## $ ID : int 1 2 3 4 5 6 7 8 9 10 ...
## $ Name : Factor w/ 500 levels "Abstractedchocolat",..: 297 451 168 40 485 199 435 339 242 395 ...
## $ Industry : Factor w/ 7 levels "Construction",..: 7 5 6 5 7 5 2 1 5 2 ...
## $ Inception: int 2006 2009 2012 2011 2013 2013 2009 2013 2009 2010 ...
## $ Employees: int 25 36 NA 66 45 60 116 73 55 25 ...
## $ State : Factor w/ 42 levels "AL","AZ","CA",..: 36 33 35 3 41 27 22 29 3 8 ...
## $ City : Factor w/ 297 levels "Addison","Alexandria",..: 94 181 105 195 151 154 53 295 232 26 ...
## $ Revenue : Factor w/ 498 levels "$1,614,585","$1,835,717",..: 479 194 485 246 402 141 308 NA 96 117 ...
## $ Expenses : Factor w/ 497 levels "1,026,548 Dollars",..: 6 485 3 248 227 247 57 NA 402 495 ...
## $ Profit : int 8553827 13212508 8701897 10727561 4193069 8179177 3259485 NA 5274553 11412916 ...
## $ Growth : Factor w/ 32 levels "-2%","-3%","0%",..: 14 16 11 14 14 18 12 NA 26 16 ...
# Changing from non factor to factor
fin$ID <- factor(fin$ID)
fin$Inception <- factor(fin$Inception)
# Factor Variable Trap (FVT)
# Sub() and gsub()
# Replace the first occurrence of a pattern with sub() or replace all occurrences with gsub()
fin$Expenses <- gsub(" Dollars","", fin$Expenses)
fin$Expenses <- gsub(",","", fin$Expenses)
fin$Revenue <- gsub("\\$", "", fin$Revenue)
fin$Revenue <- gsub(",","", fin$Revenue)
fin$Growth <- gsub("%","", fin$Growth)
fin$Expenses <- as.numeric(fin$Expenses)
fin$Revenue <- as.numeric(fin$Revenue)
fin$Growth <- as.numeric(fin$Growth)
head(fin)
## ID Name Industry Inception Employees State City
## 1 1 Over-Hex Software 2006 25 TN Franklin
## 2 2 Unimattax IT Services 2009 36 PA Newtown Square
## 3 3 Greenfax Retail 2012 NA SC Greenville
## 4 4 Blacklane IT Services 2011 66 CA Orange
## 5 5 Yearflex Software 2013 45 WI Madison
## 6 6 Indigoplanet IT Services 2013 60 NJ Manalapan
## Revenue Expenses Profit Growth
## 1 9684527 1130700 8553827 19
## 2 14016543 804035 13212508 20
## 3 9746272 1044375 8701897 16
## 4 15359369 4631808 10727561 19
## 5 8567910 4374841 4193069 19
## 6 12805452 4626275 8179177 22
tail(fin)
## ID Name Industry Inception Employees State
## 495 495 Rawfishcomplete Financial Services 2012 124 CA
## 496 496 Buretteadmirable IT Services 2009 93 ME
## 497 497 Inventtremendous Construction 2009 24 MN
## 498 498 Overviewparrot Retail 2011 7125 TX
## 499 499 Belaguerra IT Services 2010 140 MI
## 500 500 Allpossible IT Services 2011 24 CA
## City Revenue Expenses Profit Growth
## 495 Los Angeles 10624949 2951178 7673771 22
## 496 Portland 15407450 2833136 12574314 25
## 497 Woodbury 9144857 4755995 4388862 11
## 498 Fort Worth 11134728 5152110 5982618 12
## 499 Troy 17387130 1387784 15999346 23
## 500 Los Angeles 11949706 689161 11260545 24
# Locate Missing Data
fin[!complete.cases(fin),]
## ID Name Industry Inception Employees State
## 3 3 Greenfax Retail 2012 NA SC
## 8 8 Rednimdox Construction 2013 73 NY
## 11 11 Canecorporation Health 2012 6 <NA>
## 14 14 Techline <NA> 2006 65 CA
## 15 15 Cityace <NA> 2010 25 CO
## 17 17 Ganzlax IT Services 2011 75 NJ
## 22 22 Lathotline Health <NA> 103 VA
## 44 44 Ganzgreen Construction 2010 224 TN
## 84 84 Drilldrill Software 2010 30 <NA>
## 267 267 Circlechop Software 2010 14 <NA>
## 332 332 Westminster Financial Services 2010 NA MI
## 379 379 Stovepuck Retail 2013 73 <NA>
## City Revenue Expenses Profit Growth
## 3 Greenville 9746272 1044375 8701897 16
## 8 Woodside NA NA NA NA
## 11 New York 10597009 7591189 3005820 7
## 14 San Ramon 13898119 5470303 8427816 23
## 15 Louisville 9254614 6249498 3005116 6
## 17 Iselin 14001180 NA 11901180 18
## 22 McLean 9418303 7567233 1851070 2
## 44 Franklin NA NA NA 9
## 84 San Francisco 7800620 2785799 5014821 17
## 267 San Francisco 9067070 5929828 3137242 20
## 332 Troy 11861652 5245126 6616526 15
## 379 New York 13814975 5904502 7910473 10
# Selecting data with non missing values
fin[which(fin$Employees == 45),]
## ID Name Industry Inception Employees State
## 5 5 Yearflex Software 2013 45 WI
## 137 137 Toughcare Retail 2009 45 CA
## 183 183 Ittech IT Services 2013 45 MN
## 200 200 Lalane Retail 2003 45 MN
## 208 208 Countslovenly Construction 2010 45 FL
## 245 245 Peskyevaluate IT Services 2010 45 VA
## 360 360 Remembergabbro Construction 2012 45 UT
## 380 380 Pickyfive IT Services 2011 45 CO
## 435 435 Lucrepickled IT Services 2004 45 VA
## 487 487 Genusequ Construction 2007 45 NC
## City Revenue Expenses Profit Growth
## 5 Madison 8567910 4374841 4193069 19
## 137 Burbank 12429629 5796075 6633554 14
## 183 Minneapolis 11133739 6544488 4589251 20
## 200 Golden Valley 12461526 4934351 7527175 14
## 208 Spring Hill 8380367 8213905 166462 10
## 245 Richmond 13011611 4284410 8727201 23
## 360 Lindon 10878578 4515112 6363466 12
## 380 Denver 14826723 4458447 10368276 26
## 435 Glen Allen 12894933 3512395 9382538 17
## 487 Greensboro 8498464 5741773 2756691 11
# Selcting data with missing values using .na()
fin[is.na(fin$Revenue),]
## ID Name Industry Inception Employees State City Revenue
## 8 8 Rednimdox Construction 2013 73 NY Woodside NA
## 44 44 Ganzgreen Construction 2010 224 TN Franklin NA
## Expenses Profit Growth
## 8 NA NA NA
## 44 NA NA 9
# Removing records with missing values
fin_backup <- fin
is.na(fin$Industry)
## [1] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [12] FALSE FALSE TRUE TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [23] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [34] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [45] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [56] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [67] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [78] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [89] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [100] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [111] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [122] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [133] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [144] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [155] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [166] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [177] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [188] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [199] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [210] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [221] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [232] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [243] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [254] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [265] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [276] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [287] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [298] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [309] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [320] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [331] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [342] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [353] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [364] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [375] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [386] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [397] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [408] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [419] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [430] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [441] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [452] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [463] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [474] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [485] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [496] FALSE FALSE FALSE FALSE FALSE
fin <- fin[!is.na(fin$Industry),]
# Resetting the dataframe index
rownames(fin) <- 1:nrow(fin)
# OR the fastest way
rownames(fin) <- NULL
# Replacing missing data: Factual Analysis
fin[is.na(fin$State),]
## ID Name Industry Inception Employees State City
## 11 11 Canecorporation Health 2012 6 <NA> New York
## 82 84 Drilldrill Software 2010 30 <NA> San Francisco
## 265 267 Circlechop Software 2010 14 <NA> San Francisco
## 377 379 Stovepuck Retail 2013 73 <NA> New York
## Revenue Expenses Profit Growth
## 11 10597009 7591189 3005820 7
## 82 7800620 2785799 5014821 17
## 265 9067070 5929828 3137242 20
## 377 13814975 5904502 7910473 10
fin[is.na(fin$State) & fin$City == "New York",]
## ID Name Industry Inception Employees State City
## 11 11 Canecorporation Health 2012 6 <NA> New York
## 377 379 Stovepuck Retail 2013 73 <NA> New York
## Revenue Expenses Profit Growth
## 11 10597009 7591189 3005820 7
## 377 13814975 5904502 7910473 10
fin[is.na(fin$State) & fin$City == "New York", "State"] <- "NY"
fin[is.na(fin$State) & fin$City == "San Francisco", "State"] <- "CA"
fin[!complete.cases(fin),]
## ID Name Industry Inception Employees State
## 3 3 Greenfax Retail 2012 NA SC
## 8 8 Rednimdox Construction 2013 73 NY
## 15 17 Ganzlax IT Services 2011 75 NJ
## 20 22 Lathotline Health <NA> 103 VA
## 42 44 Ganzgreen Construction 2010 224 TN
## 330 332 Westminster Financial Services 2010 NA MI
## City Revenue Expenses Profit Growth
## 3 Greenville 9746272 1044375 8701897 16
## 8 Woodside NA NA NA NA
## 15 Iselin 14001180 NA 11901180 18
## 20 McLean 9418303 7567233 1851070 2
## 42 Franklin NA NA NA 9
## 330 Troy 11861652 5245126 6616526 15
# Replacing Missing Data: Median Imputation Method
med_emp_retail <- median(fin[fin$Industry=="Retail", "Employees"],na.rm = T)
fin[is.na(fin$Employees) & fin$Industry == "Retail", "Employees"] <- med_emp_retail
med_emp_fin_sec <- median(fin[fin$Industry=="Financial Services", "Employees"], na.rm = T)
fin[is.na(fin$Employees) & fin$Industry == "Financial Services", "Employees"] <- med_emp_fin_sec
med_grwth <- median(fin[fin$Industry=="Construction","Growth"], na.rm = T)
fin[is.na(fin$Growth) & fin$Industry=="Construction","Growth"] <- med_grwth
# dealing with multiple values at same time (single columns)
fin[!complete.cases(fin),]
## ID Name Industry Inception Employees State City Revenue
## 8 8 Rednimdox Construction 2013 73 NY Woodside NA
## 15 17 Ganzlax IT Services 2011 75 NJ Iselin 14001180
## 20 22 Lathotline Health <NA> 103 VA McLean 9418303
## 42 44 Ganzgreen Construction 2010 224 TN Franklin NA
## Expenses Profit Growth
## 8 NA NA 10
## 15 NA 11901180 18
## 20 7567233 1851070 2
## 42 NA NA 9
med_rev <- median(fin[fin$Industry == "Construction","Revenue"], na.rm = T )
fin[is.na(fin$Revenue) & fin$Industry == "Construction", "Revenue"]
## [1] NA NA
fin[is.na(fin$Revenue) & fin$Industry == "Construction", "Revenue"] <- med_rev
med_exp <- median(fin[fin$Industry == "Construction","Expenses"], na.rm = T )
fin[is.na(fin$Expenses) & fin$Industry == "Construction", "Expenses"]
## [1] NA NA
fin[is.na(fin$Expenses) & fin$Industry == "Construction" & is.na(fin$Profit), "Expenses"] <- med_exp
# Replacing Missing Data: Deriving Values Method
# dealing with multiple values at same time (multiple columns)
fin[is.na(fin$Expenses) & !is.na(fin$Profit), "Expenses"] <- fin[is.na(fin$Expenses) & !is.na(fin$Profit), "Revenue" ] -
fin[is.na(fin$Expenses) & !is.na(fin$Profit), "Profit" ]
fin[is.na(fin$Profit), "Profit"] <- fin[is.na(fin$Profit), "Revenue" ] - fin[is.na(fin$Profit), "Expenses" ]
# Visulization
#install.packages("ggplot2")
library(ggplot2)
# A scatterplot classified by insudstry showing revenue, expenses, profit
p <- ggplot(data = fin)
p + geom_point(aes(x = Revenue, y = Expenses,
color = Industry, size = Profit))

# A scatterplot that includes inductry trends for the expenses
d <- ggplot(data = fin,aes(x = Revenue, y = Expenses,
color = Industry) )
d + geom_point() + geom_smooth(fill = NA, size = 1.2)
## `geom_smooth()` using method = 'loess'

# Boxplot
f<- ggplot(data = fin,aes(x = Industry, y = Growth, color = Industry))
f + geom_boxplot(size = 1)
