# Financial Review

# Data preprocessing

# Importing the dataset
fin <- read.csv('Future-500.csv', na.strings = c(""))
summary(fin)
##        ID                        Name                   Industry  
##  Min.   :  1.0   Abstractedchocolat:  1   IT Services       :146  
##  1st Qu.:125.8   Abusivebong       :  1   Health            : 86  
##  Median :250.5   Acclaimedcirl     :  1   Software          : 64  
##  Mean   :250.5   Admitruppell      :  1   Financial Services: 54  
##  3rd Qu.:375.2   Admonishbadelynge :  1   Construction      : 50  
##  Max.   :500.0   Ahemparticular    :  1   (Other)           : 98  
##                  (Other)           :494   NA's              :  2  
##    Inception      Employees           State              City    
##  Min.   :1999   Min.   :   1.00   CA     : 57   San Diego  : 13  
##  1st Qu.:2009   1st Qu.:  27.25   VA     : 50   New York   : 11  
##  Median :2011   Median :  56.00   TX     : 47   Reston     : 10  
##  Mean   :2010   Mean   : 148.61   FL     : 34   Houston    :  9  
##  3rd Qu.:2012   3rd Qu.: 126.00   MD     : 25   Austin     :  8  
##  Max.   :2014   Max.   :7125.00   (Other):283   Minneapolis:  8  
##  NA's   :1      NA's   :2         NA's   :  4   (Other)    :441  
##         Revenue                 Expenses       Profit        
##  $1,614,585 :  1   1,026,548 Dollars:  1   Min.   :   12434  
##  $1,835,717 :  1   1,040,662 Dollars:  1   1st Qu.: 3272074  
##  $10,064,297:  1   1,044,375 Dollars:  1   Median : 6513366  
##  $10,067,223:  1   1,097,353 Dollars:  1   Mean   : 6539474  
##  $10,072,452:  1   1,117,206 Dollars:  1   3rd Qu.: 9303951  
##  (Other)    :493   (Other)          :492   Max.   :19624534  
##  NA's       :  2   NA's             :  3   NA's   :2         
##      Growth   
##  20%    : 39  
##  19%    : 35  
##  17%    : 27  
##  6%     : 25  
##  12%    : 24  
##  (Other):349  
##  NA's   :  1
str(fin)
## 'data.frame':    500 obs. of  11 variables:
##  $ ID       : int  1 2 3 4 5 6 7 8 9 10 ...
##  $ Name     : Factor w/ 500 levels "Abstractedchocolat",..: 297 451 168 40 485 199 435 339 242 395 ...
##  $ Industry : Factor w/ 7 levels "Construction",..: 7 5 6 5 7 5 2 1 5 2 ...
##  $ Inception: int  2006 2009 2012 2011 2013 2013 2009 2013 2009 2010 ...
##  $ Employees: int  25 36 NA 66 45 60 116 73 55 25 ...
##  $ State    : Factor w/ 42 levels "AL","AZ","CA",..: 36 33 35 3 41 27 22 29 3 8 ...
##  $ City     : Factor w/ 297 levels "Addison","Alexandria",..: 94 181 105 195 151 154 53 295 232 26 ...
##  $ Revenue  : Factor w/ 498 levels "$1,614,585","$1,835,717",..: 479 194 485 246 402 141 308 NA 96 117 ...
##  $ Expenses : Factor w/ 497 levels "1,026,548 Dollars",..: 6 485 3 248 227 247 57 NA 402 495 ...
##  $ Profit   : int  8553827 13212508 8701897 10727561 4193069 8179177 3259485 NA 5274553 11412916 ...
##  $ Growth   : Factor w/ 32 levels "-2%","-3%","0%",..: 14 16 11 14 14 18 12 NA 26 16 ...
# Changing from non factor to factor
fin$ID <- factor(fin$ID)
fin$Inception <- factor(fin$Inception)

# Factor Variable Trap (FVT)
 # Sub() and gsub()
 # Replace the first occurrence of a pattern with sub() or replace all occurrences with gsub()
fin$Expenses <- gsub(" Dollars","", fin$Expenses)
fin$Expenses <- gsub(",","", fin$Expenses)
fin$Revenue <- gsub("\\$", "", fin$Revenue)
fin$Revenue <- gsub(",","", fin$Revenue)
fin$Growth <- gsub("%","", fin$Growth)

fin$Expenses <- as.numeric(fin$Expenses)
fin$Revenue <- as.numeric(fin$Revenue)
fin$Growth <- as.numeric(fin$Growth)
head(fin)
##   ID         Name    Industry Inception Employees State           City
## 1  1     Over-Hex    Software      2006        25    TN       Franklin
## 2  2    Unimattax IT Services      2009        36    PA Newtown Square
## 3  3     Greenfax      Retail      2012        NA    SC     Greenville
## 4  4    Blacklane IT Services      2011        66    CA         Orange
## 5  5     Yearflex    Software      2013        45    WI        Madison
## 6  6 Indigoplanet IT Services      2013        60    NJ      Manalapan
##    Revenue Expenses   Profit Growth
## 1  9684527  1130700  8553827     19
## 2 14016543   804035 13212508     20
## 3  9746272  1044375  8701897     16
## 4 15359369  4631808 10727561     19
## 5  8567910  4374841  4193069     19
## 6 12805452  4626275  8179177     22
tail(fin)
##      ID             Name           Industry Inception Employees State
## 495 495  Rawfishcomplete Financial Services      2012       124    CA
## 496 496 Buretteadmirable        IT Services      2009        93    ME
## 497 497 Inventtremendous       Construction      2009        24    MN
## 498 498   Overviewparrot             Retail      2011      7125    TX
## 499 499       Belaguerra        IT Services      2010       140    MI
## 500 500      Allpossible        IT Services      2011        24    CA
##            City  Revenue Expenses   Profit Growth
## 495 Los Angeles 10624949  2951178  7673771     22
## 496    Portland 15407450  2833136 12574314     25
## 497    Woodbury  9144857  4755995  4388862     11
## 498  Fort Worth 11134728  5152110  5982618     12
## 499        Troy 17387130  1387784 15999346     23
## 500 Los Angeles 11949706   689161 11260545     24
# Locate Missing Data
fin[!complete.cases(fin),]
##      ID            Name           Industry Inception Employees State
## 3     3        Greenfax             Retail      2012        NA    SC
## 8     8       Rednimdox       Construction      2013        73    NY
## 11   11 Canecorporation             Health      2012         6  <NA>
## 14   14        Techline               <NA>      2006        65    CA
## 15   15         Cityace               <NA>      2010        25    CO
## 17   17         Ganzlax        IT Services      2011        75    NJ
## 22   22      Lathotline             Health      <NA>       103    VA
## 44   44       Ganzgreen       Construction      2010       224    TN
## 84   84      Drilldrill           Software      2010        30  <NA>
## 267 267      Circlechop           Software      2010        14  <NA>
## 332 332     Westminster Financial Services      2010        NA    MI
## 379 379       Stovepuck             Retail      2013        73  <NA>
##              City  Revenue Expenses   Profit Growth
## 3      Greenville  9746272  1044375  8701897     16
## 8        Woodside       NA       NA       NA     NA
## 11       New York 10597009  7591189  3005820      7
## 14      San Ramon 13898119  5470303  8427816     23
## 15     Louisville  9254614  6249498  3005116      6
## 17         Iselin 14001180       NA 11901180     18
## 22         McLean  9418303  7567233  1851070      2
## 44       Franklin       NA       NA       NA      9
## 84  San Francisco  7800620  2785799  5014821     17
## 267 San Francisco  9067070  5929828  3137242     20
## 332          Troy 11861652  5245126  6616526     15
## 379      New York 13814975  5904502  7910473     10
# Selecting data with non missing values
fin[which(fin$Employees == 45),]
##      ID           Name     Industry Inception Employees State
## 5     5       Yearflex     Software      2013        45    WI
## 137 137      Toughcare       Retail      2009        45    CA
## 183 183         Ittech  IT Services      2013        45    MN
## 200 200         Lalane       Retail      2003        45    MN
## 208 208  Countslovenly Construction      2010        45    FL
## 245 245  Peskyevaluate  IT Services      2010        45    VA
## 360 360 Remembergabbro Construction      2012        45    UT
## 380 380      Pickyfive  IT Services      2011        45    CO
## 435 435   Lucrepickled  IT Services      2004        45    VA
## 487 487       Genusequ Construction      2007        45    NC
##              City  Revenue Expenses   Profit Growth
## 5         Madison  8567910  4374841  4193069     19
## 137       Burbank 12429629  5796075  6633554     14
## 183   Minneapolis 11133739  6544488  4589251     20
## 200 Golden Valley 12461526  4934351  7527175     14
## 208   Spring Hill  8380367  8213905   166462     10
## 245      Richmond 13011611  4284410  8727201     23
## 360        Lindon 10878578  4515112  6363466     12
## 380        Denver 14826723  4458447 10368276     26
## 435    Glen Allen 12894933  3512395  9382538     17
## 487    Greensboro  8498464  5741773  2756691     11
# Selcting data with missing values using .na()
fin[is.na(fin$Revenue),]
##    ID      Name     Industry Inception Employees State     City Revenue
## 8   8 Rednimdox Construction      2013        73    NY Woodside      NA
## 44 44 Ganzgreen Construction      2010       224    TN Franklin      NA
##    Expenses Profit Growth
## 8        NA     NA     NA
## 44       NA     NA      9
# Removing records with missing values
fin_backup <- fin

is.na(fin$Industry)
##   [1] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
##  [12] FALSE FALSE  TRUE  TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
##  [23] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
##  [34] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
##  [45] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
##  [56] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
##  [67] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
##  [78] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
##  [89] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [100] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [111] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [122] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [133] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [144] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [155] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [166] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [177] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [188] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [199] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [210] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [221] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [232] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [243] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [254] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [265] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [276] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [287] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [298] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [309] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [320] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [331] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [342] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [353] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [364] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [375] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [386] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [397] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [408] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [419] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [430] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [441] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [452] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [463] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [474] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [485] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [496] FALSE FALSE FALSE FALSE FALSE
fin <- fin[!is.na(fin$Industry),]

# Resetting the dataframe index
rownames(fin) <- 1:nrow(fin)
 # OR the fastest way
rownames(fin) <- NULL

# Replacing missing data: Factual Analysis
fin[is.na(fin$State),]
##      ID            Name Industry Inception Employees State          City
## 11   11 Canecorporation   Health      2012         6  <NA>      New York
## 82   84      Drilldrill Software      2010        30  <NA> San Francisco
## 265 267      Circlechop Software      2010        14  <NA> San Francisco
## 377 379       Stovepuck   Retail      2013        73  <NA>      New York
##      Revenue Expenses  Profit Growth
## 11  10597009  7591189 3005820      7
## 82   7800620  2785799 5014821     17
## 265  9067070  5929828 3137242     20
## 377 13814975  5904502 7910473     10
fin[is.na(fin$State) & fin$City == "New York",]
##      ID            Name Industry Inception Employees State     City
## 11   11 Canecorporation   Health      2012         6  <NA> New York
## 377 379       Stovepuck   Retail      2013        73  <NA> New York
##      Revenue Expenses  Profit Growth
## 11  10597009  7591189 3005820      7
## 377 13814975  5904502 7910473     10
fin[is.na(fin$State) & fin$City == "New York", "State"] <- "NY"
fin[is.na(fin$State) & fin$City == "San Francisco", "State"] <- "CA"

fin[!complete.cases(fin),]
##      ID        Name           Industry Inception Employees State
## 3     3    Greenfax             Retail      2012        NA    SC
## 8     8   Rednimdox       Construction      2013        73    NY
## 15   17     Ganzlax        IT Services      2011        75    NJ
## 20   22  Lathotline             Health      <NA>       103    VA
## 42   44   Ganzgreen       Construction      2010       224    TN
## 330 332 Westminster Financial Services      2010        NA    MI
##           City  Revenue Expenses   Profit Growth
## 3   Greenville  9746272  1044375  8701897     16
## 8     Woodside       NA       NA       NA     NA
## 15      Iselin 14001180       NA 11901180     18
## 20      McLean  9418303  7567233  1851070      2
## 42    Franklin       NA       NA       NA      9
## 330       Troy 11861652  5245126  6616526     15
# Replacing Missing Data: Median Imputation Method
med_emp_retail <- median(fin[fin$Industry=="Retail", "Employees"],na.rm = T)
fin[is.na(fin$Employees) & fin$Industry == "Retail", "Employees"] <- med_emp_retail

med_emp_fin_sec <- median(fin[fin$Industry=="Financial Services", "Employees"], na.rm = T)
fin[is.na(fin$Employees) & fin$Industry == "Financial Services", "Employees"] <- med_emp_fin_sec

med_grwth <- median(fin[fin$Industry=="Construction","Growth"], na.rm = T)
fin[is.na(fin$Growth) & fin$Industry=="Construction","Growth"] <- med_grwth
 
  # dealing with multiple values at same time (single columns)
fin[!complete.cases(fin),]
##    ID       Name     Industry Inception Employees State     City  Revenue
## 8   8  Rednimdox Construction      2013        73    NY Woodside       NA
## 15 17    Ganzlax  IT Services      2011        75    NJ   Iselin 14001180
## 20 22 Lathotline       Health      <NA>       103    VA   McLean  9418303
## 42 44  Ganzgreen Construction      2010       224    TN Franklin       NA
##    Expenses   Profit Growth
## 8        NA       NA     10
## 15       NA 11901180     18
## 20  7567233  1851070      2
## 42       NA       NA      9
med_rev <- median(fin[fin$Industry == "Construction","Revenue"], na.rm = T )
fin[is.na(fin$Revenue) & fin$Industry == "Construction", "Revenue"]
## [1] NA NA
fin[is.na(fin$Revenue) & fin$Industry == "Construction", "Revenue"] <- med_rev

med_exp <- median(fin[fin$Industry == "Construction","Expenses"], na.rm = T )
fin[is.na(fin$Expenses) & fin$Industry == "Construction", "Expenses"]
## [1] NA NA
fin[is.na(fin$Expenses) & fin$Industry == "Construction" & is.na(fin$Profit), "Expenses"] <- med_exp

# Replacing Missing Data: Deriving Values Method
  # dealing with multiple values at same time (multiple columns)
fin[is.na(fin$Expenses) & !is.na(fin$Profit), "Expenses"] <- fin[is.na(fin$Expenses) & !is.na(fin$Profit), "Revenue" ] -
  fin[is.na(fin$Expenses) & !is.na(fin$Profit), "Profit" ] 

fin[is.na(fin$Profit), "Profit"] <- fin[is.na(fin$Profit), "Revenue" ] - fin[is.na(fin$Profit), "Expenses" ] 

# Visulization
#install.packages("ggplot2")
library(ggplot2)

# A scatterplot classified by insudstry showing revenue, expenses, profit
p <- ggplot(data = fin)
p + geom_point(aes(x = Revenue, y = Expenses,
                   color = Industry, size = Profit))

# A scatterplot that includes inductry trends for the expenses
d <-  ggplot(data = fin,aes(x = Revenue, y = Expenses,
                            color = Industry) )
d + geom_point() + geom_smooth(fill = NA, size = 1.2)
## `geom_smooth()` using method = 'loess'

# Boxplot
f<- ggplot(data = fin,aes(x = Industry, y = Growth, color = Industry))
f + geom_boxplot(size = 1)