#Purpose of this exercise is to explore functions in R
#related to data preparation

#set working directory
setwd("C:/Users/Hai/Downloads")

#load data set -- Data Set contains made up financial data of
#made up companies -- replace blanks with NA
fin <- read.csv("P3-Future-500-The-dataset.csv", na.strings=c(""))

#show first 6 rows of data set
head(fin)
##   ID         Name    Industry Inception Employees State           City
## 1  1     Over-Hex    Software      2006        25    TN       Franklin
## 2  2    Unimattax IT Services      2009        36    PA Newtown Square
## 3  3     Greenfax      Retail      2012        NA    SC     Greenville
## 4  4    Blacklane IT Services      2011        66    CA         Orange
## 5  5     Yearflex    Software      2013        45    WI        Madison
## 6  6 Indigoplanet IT Services      2013        60    NJ      Manalapan
##       Revenue          Expenses   Profit Growth
## 1  $9,684,527 1,130,700 Dollars  8553827    19%
## 2 $14,016,543   804,035 Dollars 13212508    20%
## 3  $9,746,272 1,044,375 Dollars  8701897    16%
## 4 $15,359,369 4,631,808 Dollars 10727561    19%
## 5  $8,567,910 4,374,841 Dollars  4193069    19%
## 6 $12,805,452 4,626,275 Dollars  8179177    22%
#show structure of data set
str(fin)
## 'data.frame':    500 obs. of  11 variables:
##  $ ID       : int  1 2 3 4 5 6 7 8 9 10 ...
##  $ Name     : chr  "Over-Hex" "Unimattax" "Greenfax" "Blacklane" ...
##  $ Industry : chr  "Software" "IT Services" "Retail" "IT Services" ...
##  $ Inception: int  2006 2009 2012 2011 2013 2013 2009 2013 2009 2010 ...
##  $ Employees: int  25 36 NA 66 45 60 116 73 55 25 ...
##  $ State    : chr  "TN" "PA" "SC" "CA" ...
##  $ City     : chr  "Franklin" "Newtown Square" "Greenville" "Orange" ...
##  $ Revenue  : chr  "$9,684,527" "$14,016,543" "$9,746,272" "$15,359,369" ...
##  $ Expenses : chr  "1,130,700 Dollars" "804,035 Dollars" "1,044,375 Dollars" "4,631,808 Dollars" ...
##  $ Profit   : int  8553827 13212508 8701897 10727561 4193069 8179177 3259485 NA 5274553 11412916 ...
##  $ Growth   : chr  "19%" "20%" "16%" "19%" ...
#convert the following variables to factors
fin$ID <- factor(fin$ID)
fin$Inception <- factor(fin$Inception)

#convert the character strings in expenses 
#and revenue to a numeric value
fin$Expenses <- gsub(" Dollars","",fin$Expenses)
fin$Expenses <- gsub(",","",fin$Expenses)

fin$Revenue <- gsub("\\$","",fin$Revenue)
fin$Revenue <- gsub(",","",fin$Revenue)

fin$Growth <- gsub("%","",fin$Growth)

fin$Expenses <- as.numeric(fin$Expenses)
fin$Revenue <- as.numeric(fin$Revenue)
fin$Growth <- as.numeric(fin$Growth)

#show structure of data set after changes
str(fin)
## 'data.frame':    500 obs. of  11 variables:
##  $ ID       : Factor w/ 500 levels "1","2","3","4",..: 1 2 3 4 5 6 7 8 9 10 ...
##  $ Name     : chr  "Over-Hex" "Unimattax" "Greenfax" "Blacklane" ...
##  $ Industry : chr  "Software" "IT Services" "Retail" "IT Services" ...
##  $ Inception: Factor w/ 16 levels "1999","2000",..: 8 11 14 13 15 15 11 15 11 12 ...
##  $ Employees: int  25 36 NA 66 45 60 116 73 55 25 ...
##  $ State    : chr  "TN" "PA" "SC" "CA" ...
##  $ City     : chr  "Franklin" "Newtown Square" "Greenville" "Orange" ...
##  $ Revenue  : num  9684527 14016543 9746272 15359369 8567910 ...
##  $ Expenses : num  1130700 804035 1044375 4631808 4374841 ...
##  $ Profit   : int  8553827 13212508 8701897 10727561 4193069 8179177 3259485 NA 5274553 11412916 ...
##  $ Growth   : num  19 20 16 19 19 22 17 NA 30 20 ...
#identify missing values
fin[!complete.cases(fin),]
##      ID            Name           Industry Inception Employees State
## 3     3        Greenfax             Retail      2012        NA    SC
## 8     8       Rednimdox       Construction      2013        73    NY
## 11   11 Canecorporation             Health      2012         6  <NA>
## 14   14        Techline               <NA>      2006        65    CA
## 15   15         Cityace               <NA>      2010        25    CO
## 17   17         Ganzlax        IT Services      2011        75    NJ
## 22   22      Lathotline             Health      <NA>       103    VA
## 44   44       Ganzgreen       Construction      2010       224    TN
## 84   84      Drilldrill           Software      2010        30  <NA>
## 267 267      Circlechop           Software      2010        14  <NA>
## 332 332     Westminster Financial Services      2010        NA    MI
## 379 379       Stovepuck             Retail      2013        73  <NA>
##              City  Revenue Expenses   Profit Growth
## 3      Greenville  9746272  1044375  8701897     16
## 8        Woodside       NA       NA       NA     NA
## 11       New York 10597009  7591189  3005820      7
## 14      San Ramon 13898119  5470303  8427816     23
## 15     Louisville  9254614  6249498  3005116      6
## 17         Iselin 14001180       NA 11901180     18
## 22         McLean  9418303  7567233  1851070      2
## 44       Franklin       NA       NA       NA      9
## 84  San Francisco  7800620  2785799  5014821     17
## 267 San Francisco  9067070  5929828  3137242     20
## 332          Troy 11861652  5245126  6616526     15
## 379      New York 13814975  5904502  7910473     10
#remove row data of industries with NA since it will be an 
#important variable in our analysis and we don't have
#a method to replace the missing values
fin <- fin[!is.na(fin$Industry),]

#show remaining rows of data with NA values
fin[!complete.cases(fin),]
##      ID            Name           Industry Inception Employees State
## 3     3        Greenfax             Retail      2012        NA    SC
## 8     8       Rednimdox       Construction      2013        73    NY
## 11   11 Canecorporation             Health      2012         6  <NA>
## 17   17         Ganzlax        IT Services      2011        75    NJ
## 22   22      Lathotline             Health      <NA>       103    VA
## 44   44       Ganzgreen       Construction      2010       224    TN
## 84   84      Drilldrill           Software      2010        30  <NA>
## 267 267      Circlechop           Software      2010        14  <NA>
## 332 332     Westminster Financial Services      2010        NA    MI
## 379 379       Stovepuck             Retail      2013        73  <NA>
##              City  Revenue Expenses   Profit Growth
## 3      Greenville  9746272  1044375  8701897     16
## 8        Woodside       NA       NA       NA     NA
## 11       New York 10597009  7591189  3005820      7
## 17         Iselin 14001180       NA 11901180     18
## 22         McLean  9418303  7567233  1851070      2
## 44       Franklin       NA       NA       NA      9
## 84  San Francisco  7800620  2785799  5014821     17
## 267 San Francisco  9067070  5929828  3137242     20
## 332          Troy 11861652  5245126  6616526     15
## 379      New York 13814975  5904502  7910473     10
#resetting the data frame index
#rownames(fin) <- NULL would also achieve the same result
rownames(fin) <- 1:nrow(fin)

#show last 6 values of data set -- note the row number vs ID
tail(fin)
##      ID             Name           Industry Inception Employees State
## 493 495  Rawfishcomplete Financial Services      2012       124    CA
## 494 496 Buretteadmirable        IT Services      2009        93    ME
## 495 497 Inventtremendous       Construction      2009        24    MN
## 496 498   Overviewparrot             Retail      2011      7125    TX
## 497 499       Belaguerra        IT Services      2010       140    MI
## 498 500      Allpossible        IT Services      2011        24    CA
##            City  Revenue Expenses   Profit Growth
## 493 Los Angeles 10624949  2951178  7673771     22
## 494    Portland 15407450  2833136 12574314     25
## 495    Woodbury  9144857  4755995  4388862     11
## 496  Fort Worth 11134728  5152110  5982618     12
## 497        Troy 17387130  1387784 15999346     23
## 498 Los Angeles 11949706   689161 11260545     24
#show remaining rows of data with NA values
fin[!complete.cases(fin),]
##      ID            Name           Industry Inception Employees State
## 3     3        Greenfax             Retail      2012        NA    SC
## 8     8       Rednimdox       Construction      2013        73    NY
## 11   11 Canecorporation             Health      2012         6  <NA>
## 15   17         Ganzlax        IT Services      2011        75    NJ
## 20   22      Lathotline             Health      <NA>       103    VA
## 42   44       Ganzgreen       Construction      2010       224    TN
## 82   84      Drilldrill           Software      2010        30  <NA>
## 265 267      Circlechop           Software      2010        14  <NA>
## 330 332     Westminster Financial Services      2010        NA    MI
## 377 379       Stovepuck             Retail      2013        73  <NA>
##              City  Revenue Expenses   Profit Growth
## 3      Greenville  9746272  1044375  8701897     16
## 8        Woodside       NA       NA       NA     NA
## 11       New York 10597009  7591189  3005820      7
## 15         Iselin 14001180       NA 11901180     18
## 20         McLean  9418303  7567233  1851070      2
## 42       Franklin       NA       NA       NA      9
## 82  San Francisco  7800620  2785799  5014821     17
## 265 San Francisco  9067070  5929828  3137242     20
## 330          Troy 11861652  5245126  6616526     15
## 377      New York 13814975  5904502  7910473     10
#from the subset of rows with missing values, let's look at
#states with NA
fin[is.na(fin$State),]
##      ID            Name Industry Inception Employees State          City
## 11   11 Canecorporation   Health      2012         6  <NA>      New York
## 82   84      Drilldrill Software      2010        30  <NA> San Francisco
## 265 267      Circlechop Software      2010        14  <NA> San Francisco
## 377 379       Stovepuck   Retail      2013        73  <NA>      New York
##      Revenue Expenses  Profit Growth
## 11  10597009  7591189 3005820      7
## 82   7800620  2785799 5014821     17
## 265  9067070  5929828 3137242     20
## 377 13814975  5904502 7910473     10
#since we know the missing values of state, we will replace them manually
fin[is.na(fin$State) & fin$City == "New York","State"] <- "NY"
fin[is.na(fin$State) & fin$City == "San Francisco","State"] <- "CA"

#show table with newly replaced values
fin[c(11,82,265,377),]
##      ID            Name Industry Inception Employees State          City
## 11   11 Canecorporation   Health      2012         6    NY      New York
## 82   84      Drilldrill Software      2010        30    CA San Francisco
## 265 267      Circlechop Software      2010        14    CA San Francisco
## 377 379       Stovepuck   Retail      2013        73    NY      New York
##      Revenue Expenses  Profit Growth
## 11  10597009  7591189 3005820      7
## 82   7800620  2785799 5014821     17
## 265  9067070  5929828 3137242     20
## 377 13814975  5904502 7910473     10
#show remaining rows of data with NA values
fin[!complete.cases(fin),]
##      ID        Name           Industry Inception Employees State       City
## 3     3    Greenfax             Retail      2012        NA    SC Greenville
## 8     8   Rednimdox       Construction      2013        73    NY   Woodside
## 15   17     Ganzlax        IT Services      2011        75    NJ     Iselin
## 20   22  Lathotline             Health      <NA>       103    VA     McLean
## 42   44   Ganzgreen       Construction      2010       224    TN   Franklin
## 330 332 Westminster Financial Services      2010        NA    MI       Troy
##      Revenue Expenses   Profit Growth
## 3    9746272  1044375  8701897     16
## 8         NA       NA       NA     NA
## 15  14001180       NA 11901180     18
## 20   9418303  7567233  1851070      2
## 42        NA       NA       NA      9
## 330 11861652  5245126  6616526     15
#replacing missing values : Median Imputation Method

#calc medians for missing variables by Industry
median_employees_retail <- median(fin[fin$Industry == "Retail","Employees"], na.rm=TRUE)
median_employees_fs <- median(fin[fin$Industry == "Financial Services","Employees"], na.rm=TRUE)
median_growth_construction <- median(fin[fin$Industry == "Construction","Growth"], na.rm=TRUE)
median_revenue_construction <- median(fin[fin$Industry == "Construction","Revenue"], na.rm=TRUE)
median_expenses_construction <- median(fin[fin$Industry == "Construction","Expenses"], na.rm=TRUE)

#replace NA values with median
fin[is.na(fin$Employees) & fin$Industry == "Retail","Employees"] <- median_employees_retail
fin[is.na(fin$Employees) & fin$Industry == "Financial Services","Employees"] <- median_employees_fs
fin[is.na(fin$Growth) & fin$Industry == "Construction","Growth"] <- median_growth_construction
fin[is.na(fin$Revenue) & fin$Industry == "Construction","Revenue"] <- median_revenue_construction
fin[is.na(fin$Expenses) & fin$Industry == "Construction","Expenses"] <- median_expenses_construction

#show table with newly replaced values
fin[c(3,330),]
##      ID        Name           Industry Inception Employees State       City
## 3     3    Greenfax             Retail      2012        28    SC Greenville
## 330 332 Westminster Financial Services      2010        80    MI       Troy
##      Revenue Expenses  Profit Growth
## 3    9746272  1044375 8701897     16
## 330 11861652  5245126 6616526     15
#show remaining rows of data with NA values
fin[!complete.cases(fin),]
##    ID       Name     Industry Inception Employees State     City  Revenue
## 8   8  Rednimdox Construction      2013        73    NY Woodside  9055059
## 15 17    Ganzlax  IT Services      2011        75    NJ   Iselin 14001180
## 20 22 Lathotline       Health      <NA>       103    VA   McLean  9418303
## 42 44  Ganzgreen Construction      2010       224    TN Franklin  9055059
##    Expenses   Profit Growth
## 8   4506976       NA     10
## 15       NA 11901180     18
## 20  7567233  1851070      2
## 42  4506976       NA      9
#replacing mising values: derived values
#revenue - expenses = profit

#replace NA values using formula
fin[is.na(fin$Profit),"Profit"] <- fin[is.na(fin$Profit),"Revenue"] - fin[is.na(fin$Profit),"Expenses"]
fin[is.na(fin$Expenses),"Expenses"] <- fin[is.na(fin$Expenses),"Revenue"] - fin[is.na(fin$Expenses),"Profit"]

#show table of newly replaced values
fin[c(8,15,42),]
##    ID      Name     Industry Inception Employees State     City  Revenue
## 8   8 Rednimdox Construction      2013        73    NY Woodside  9055059
## 15 17   Ganzlax  IT Services      2011        75    NJ   Iselin 14001180
## 42 44 Ganzgreen Construction      2010       224    TN Franklin  9055059
##    Expenses   Profit Growth
## 8   4506976  4548083     10
## 15  2100000 11901180     18
## 42  4506976  4548083      9
#show remaining rows of data with NA values
#empty value in Inception will not affect our analysis so
#we will not replace this value
fin[!complete.cases(fin),]
##    ID       Name Industry Inception Employees State   City Revenue Expenses
## 20 22 Lathotline   Health      <NA>       103    VA McLean 9418303  7567233
##     Profit Growth
## 20 1851070      2
#Visualizations
library(ggplot2)
library(scales)

#Scatterplot classified by Industry, Profit showing revenue, expenses
ggplot(data=fin) +
  geom_point(aes(x=Revenue, y=Expenses, color=Industry,
                 size=Profit)) +
  ggtitle("Revenue vs Expense by Industry, Profit") +
  theme(plot.title = element_text(hjust=0.5)) +
  scale_y_continuous(labels = function(y) y/1000000) +
  scale_x_continuous(labels = function(x) x/1000000) +
  scale_size_continuous(labels = function(size) size/1000000) +
  xlab("Revenue (M)") + ylab("Expenses (M)") + labs(color="Industry", size="Profit (M)")

#Scatterplot that also includes trends for the
ggplot(data=fin, aes(x=Revenue, y=Expenses, color=Industry)) +
  geom_point() + geom_smooth(fill=NA, size=1.2) +
  ggtitle("Revenue vs Expense by Industry") +
  theme(plot.title = element_text(hjust=0.5)) +
  coord_cartesian(ylim=c(0,10000000)) +
  scale_y_continuous(labels = function(y) y/1000000) +
  scale_x_continuous(labels = function(x) x/1000000) +
  scale_size_continuous(labels = function(size) size/1000000) +
  xlab("Revenue (M)") + ylab("Expenses (M)")
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'

#Boxplot
ggplot(data=fin, aes(x=Industry, y=Growth, color=Industry)) +
         geom_jitter() + geom_boxplot(size=1, alpha=0.5, outlier.color=NA) +
    ggtitle("Growth by Industry Boxplot") +
    theme(axis.text.x=element_blank(),
          plot.title = element_text(hjust=0.5))