#Checkpoint 1: Data Cleaning 1
################################

#Load the companies and rounds data (provided on the next page) into two data frames and name them 
#companies and rounds2 respectively.
#setwd("C:\\mycomp\\practice\\datawarehousing\\OLAP-BI\\group_assignment")
companies<-read.delim("companies.txt",header = TRUE,stringsAsFactors = FALSE,sep="\t")
rounds2<-read.csv("rounds2.csv",header = TRUE,stringsAsFactors = FALSE)

library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
#checkpoint1
#How many unique companies are present in rounds2?

#convert the company_permalink to uppercase letters
rounds2$company_permalink<-toupper(rounds2$company_permalink)

#Get the list of unique companies from rounds2
rounds2_unique<-distinct(rounds2,company_permalink)
nrow(rounds2_unique)
## [1] 66368
#How many unique companies are present in companies?
#convert the column permalink to uppercase letters
companies$permalink<-toupper(companies$permalink)

#Get the list of unique companies in companies data frame
companies_unique<-distinct(companies,permalink)
nrow(companies_unique)
## [1] 66368
#In the companies data frame, which column can be used as the unique key for each company? Write the name of the column.
#Ans:permalink

#Are there any companies in the rounds2 file which are not present in companies? Answer yes or no: Y/N
#Find out if there are any companies which are in rounds2 but not in companies data frame.
companies_diff<-!(rounds2_unique$company_permalink %in% companies$permalink)
length(which(companies_diff=="TRUE"))
## [1] 0
#Since the above statement returns 0 we can conlude that there are no companies in rounds2 that are not in companies


#Merge the two data frames so that all  variables (columns)  in the companies frame are added to the 
#rounds2 data frame. Name the merged frame master_frame. How many observations are present in master_frame ?
#rename column before merge
names(rounds2)[1]<-paste("permalink")

#merge both companies and rounds2
master_frame<-merge(rounds2,companies,by="permalink")

nrow(master_frame)
## [1] 114949
#Checkpoint 2: Data Cleaning 2
#################################
#How many NA values are present in the column raised_amount_usd ?
length(which(is.na(master_frame$raised_amount_usd)))
## [1] 19990
#What do you replace NA values of raised_amount_usd  with? Enter a numeric value.
#Ans: 0

master_frame[which(is.na(master_frame$raised_amount_usd)),"raised_amount_usd"]<-0

#Checkpoint 3: Funding Type Analysis
#####################################

#To calculate the mean for each funding type first group by funding round type
group_by_funding_type<-group_by(master_frame,funding_round_type)

funding_types_mean<-summarise(group_by_funding_type,mean(raised_amount_usd))


#Average funding amount of venture type
funding_types_mean[funding_types_mean$funding_round_type=="venture",]
## # A tibble: 1 x 2
##   funding_round_type `mean(raised_amount_usd)`
##                <chr>                     <dbl>
## 1            venture                  10634054
#Average funding amount of angel type
funding_types_mean[funding_types_mean$funding_round_type=="angel",]
## # A tibble: 1 x 2
##   funding_round_type `mean(raised_amount_usd)`
##                <chr>                     <dbl>
## 1              angel                  764564.3
#Average funding amount of seed type
funding_types_mean[funding_types_mean$funding_round_type=="seed",]
## # A tibble: 1 x 2
##   funding_round_type `mean(raised_amount_usd)`
##                <chr>                     <dbl>
## 1               seed                  556606.7
#Average funding amount of private equity type
funding_types_mean[funding_types_mean$funding_round_type=="private_equity",]
## # A tibble: 1 x 2
##   funding_round_type `mean(raised_amount_usd)`
##                <chr>                     <dbl>
## 1     private_equity                  62111788
#Considering that Spark Funds wants to invest between 5 to 15 million USD per  investment round, 
#which investment type is the most suitable for them?

funding_types_mean[(funding_types_mean$funding_round_type %in% c("venture","seed","angel","private_equity")) & (funding_types_mean$`mean(raised_amount_usd)`>5000000 & funding_types_mean$`mean(raised_amount_usd)`<15000000),]
## # A tibble: 1 x 2
##   funding_round_type `mean(raised_amount_usd)`
##                <chr>                     <dbl>
## 1            venture                  10634054
#From the above command we know that venture type fund is the most suitable fund for spark funds.

#Checkpoint 4: Country Analysis
################################


#Spark Funds wants to see the top nine countries which have received the highest total funding 
# (across ALL sectors for the chosen investment type)

venture_frame<-subset(master_frame,master_frame$funding_round_type=="venture")


#group by country code
group_by_country<-group_by(venture_frame,country_code)

#summarise based on the total amount of fund received and sort it.
summarise_country_sum<-summarise(group_by_country,sum(raised_amount_usd))

#summarise based on count of total investments received by that country.
summarise_country_count<-summarise(group_by_country,length(country_code))

#merge the two data frames
summarise_country_count_sum<-merge(summarise_country_sum,summarise_country_count,by="country_code")
arrange_country<-arrange(summarise_country_count_sum,desc(`sum(raised_amount_usd)`))

#select the top9 countries from the above list.
top9<-arrange_country[1:9,]

#rename 2nd column in top9 data frame
names(top9)[2]<-paste("total_amount_raised")
#From the list above

#Top English speaking country = USA

#Top Second English speaking country = GBR

#Top Third English speaking country = IND


#Checkpoint 5: Sector Analysis 1
##################################

#load mapping file
mapping<-read.csv("mapping.csv",header = TRUE,stringsAsFactors = FALSE)

library(tidyr)
#change data in mapping from wide to long format.
#Removed column Blanks.
mapping$Blanks<-NULL

mapping_long<-gather(mapping,main_sector,value,2:9)

#create subset of the records where value=1. Ignore the other values as they carry no information.
sector_mapping<-subset(mapping_long,mapping_long$value==1)

#setting the value column to NULL since all of them contain value 1
sector_mapping$value<-NULL

#Rename category_list as primary_sector
sector_mapping$primary_sector<-sector_mapping$category_list
sector_mapping$category_list<-NULL

library(stringr)

#Extract the primary sector of each category list from the category_list column
venture_frame$primary_sector<-str_split(venture_frame$category_list,"\\|",simplify = TRUE)[,1]

#Use the mapping file 'mapping.csv' to map each primary sector to one of the eight main sectors 
#(Note that 'Others' is also considered one of the main sectors)
venture_frame_sector<-merge(venture_frame,sector_mapping,by="primary_sector")


#Checkpoint 6: Sector Analysis 2
#################################

#Create three separate data frames D1, D2 and D3 for each of the three countries containing the observations of 
#funding type FT falling within the 5-15 million USD range. The three data frames should contain:

#Get funding types falling in between the range 5 and 15 million
funding_bet_5_15<-subset(venture_frame_sector,venture_frame_sector$raised_amount_usd>=5000000 & venture_frame_sector$raised_amount_usd<=15000000 )


#Create three separate data frames one for each country. From the top9 data frame the top3 english
#speaking countries are "USA","GBR","IND"
D1<-subset(funding_bet_5_15,funding_bet_5_15$country_code=="USA")
D2<-subset(funding_bet_5_15,funding_bet_5_15$country_code=="GBR")
D3<-subset(funding_bet_5_15,funding_bet_5_15$country_code=="IND")

#Total number of investments for each country c1,c2,c3
nrow(D1)
## [1] 11149
nrow(D2)
## [1] 577
nrow(D3)
## [1] 299
#Total amount of investment for each sector
sum(D1$raised_amount_usd)
## [1] 99661524549
sum(D2$raised_amount_usd)
## [1] 5028704358
sum(D3$raised_amount_usd)
## [1] 2683537552
#D1 Group by main_sector along with count of investments and total investment made.
D1_group_by<-group_by(D1,main_sector)
D1_count<-summarise(D1_group_by,length(main_sector))
D1_sum<-summarise(D1_group_by,sum(raised_amount_usd))
#For listing sum and count in one column
D1_count_sum<-merge(D1_count,D1_sum,by="main_sector")

# For Country1- Total number of investments (count), Total amount of investment (USD), Top sector (based on count of investments) 
D1_arrange<-arrange(D1_count_sum,desc(`length(main_sector)`))
D1_top_sector<-D1_arrange[1,1]
D1_secondtop_sector<-D1_arrange[2,1]

#D2 Group by main_sector along with count of investments and total investment made.
D2_group_by<-group_by(D2,main_sector)
D2_count<-summarise(D2_group_by,length(main_sector))
D2_sum<-summarise(D2_group_by,sum(raised_amount_usd))
#For listing sum and count in one column
D2_count_sum<-merge(D2_count,D2_sum,by="main_sector")

# For Country2- Total number of investments (count), Total amount of investment (USD), Top sector (based on count of investments) 
D2_arrange<-arrange(D2_count_sum,desc(`length(main_sector)`))
D2_top_sector<-D2_arrange[1,1]
D2_secondtop_sector<-D2_arrange[2,1]

#D3 Group by main_sector along with count of investments and total investment made.
D3_group_by<-group_by(D3,main_sector)
D3_count<-summarise(D3_group_by,length(main_sector))
D3_sum<-summarise(D3_group_by,sum(raised_amount_usd))
#For listing sum and count in one column
D3_count_sum<-merge(D3_count,D3_sum,by="main_sector")

## For Country1- Total number of investments (count), Total amount of investment (USD), Top sector (based on count of investments) 
D3_arrange<-arrange(D3_count_sum,desc(`length(main_sector)`))
D3_top_sector<-D3_arrange[1,1]
D3_secondtop_sector<-D3_arrange[2,1]



#For point 3 (top sector count-wise), which company received the highest investment for country C1?

s1<-subset(D1,D1$main_sector==D1_top_sector)
s1_group_by<-group_by(s1,permalink)
s1_sum<-summarise(s1_group_by,sum(raised_amount_usd))
s1_arrange<-arrange(s1_sum,desc(`sum(raised_amount_usd)`))
#Get top company key
c1_s1_top_company<-toString(s1_arrange[1,1])

#Get top company name
companies[which(companies$permalink==c1_s1_top_company),"name"]
## [1] "Virtustream"
# which company received the highest investment for country C2?
s2<-subset(D2,D2$main_sector==D2_top_sector)
s2_group_by<-group_by(s2,permalink)
s2_sum<-summarise(s2_group_by,sum(raised_amount_usd))
s2_arrange<-arrange(s2_sum,desc(`sum(raised_amount_usd)`))
#Get top company key
c2_s2_top_company<-toString(s2_arrange[1,1])

#Get top company name
companies[which(companies$permalink==c2_s2_top_company),"name"]
## [1] "Electric Cloud"
#which company received the highest investment for country C3?
s3<-subset(D3,D3$main_sector==D3_top_sector)
s3_group_by<-group_by(s3,permalink)
s3_sum<-summarise(s3_group_by,sum(raised_amount_usd))
s3_arrange<-arrange(s3_sum,desc(`sum(raised_amount_usd)`))
#Get top company key
c3_s3_top_company<-toString(s3_arrange[1,1])

#Get top company name
companies[which(companies$permalink==c3_s3_top_company),"name"]
## [1] "FirstCry.com"
#For point 4 (second best sector count-wise), which company received the highest investment for country1?

s4<-subset(D1,D1$main_sector==D1_secondtop_sector)
s4_group_by<-group_by(s4,permalink)
s4_sum<-summarise(s4_group_by,sum(raised_amount_usd))
s4_arrange<-arrange(s4_sum,desc(`sum(raised_amount_usd)`))
#Get top company key
c1_s4_top_company<-toString(s4_arrange[1,1])

#Get top company name
companies[which(companies$permalink==c1_s4_top_company),"name"]
## [1] "Biodesix"
#For point 4 (second best sector count-wise), which company received the highest investment for country2?
s5<-subset(D2,D2$main_sector==D2_secondtop_sector)
s5_group_by<-group_by(s5,permalink)
s5_sum<-summarise(s5_group_by,sum(raised_amount_usd))
s5_arrange<-arrange(s5_sum,desc(`sum(raised_amount_usd)`))
#Get top company key
c2_s5_top_company<-toString(s5_arrange[1,1])

#Get top company name
companies[which(companies$permalink==c2_s5_top_company),"name"]
## [1] "EUSA Pharma"
##For point 4 (second best sector count-wise), which company received the highest investment for country3?
s6<-subset(D3,D3$main_sector==D3_secondtop_sector)
s6_group_by<-group_by(s6,permalink)
s6_sum<-summarise(s6_group_by,sum(raised_amount_usd))
s6_arrange<-arrange(s6_sum,desc(`sum(raised_amount_usd)`))
#Get top company key
c3_s6_top_company<-toString(s6_arrange[1,1])

#Get top company name
companies[which(companies$permalink==c3_s6_top_company),"name"]
## [1] "GupShup"