Import Libraries

# Import Libraries
library(RSQLite)
library(DBI)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(sqldf)
## Loading required package: gsubfn
## Loading required package: proto
## Warning in fun(libname, pkgname): couldn't connect to display ":0"
library(ggplot2)
library(viridis)
## Loading required package: viridisLite


Initial Data Analysis

# Connect to Database
CFA_db <- dbConnect(SQLite(), "CFA.db")

#Fetch the Result
dbListTables(CFA_db)
## [1] "Agent"      "Company"    "Customer"   "Investment" "Portfolio" 
## [6] "Security"
# Agent Table
agent_data <- tbl(CFA_db, "Agent")
agent_df = collect(agent_data)
agent_df
## # A tibble: 10 × 7
##    agentId agentFName agentLName agentGender agentAddress           agentContact
##      <int> <chr>      <chr>      <chr>       <chr>                  <chr>       
##  1       1 James      Smith      Male        2749 Lee Avenue        8563071183  
##  2       2 Michael    William    Male        4165 Godfrey Road      2125079835  
##  3       3 Maria      Garcia     Female      3393 Marshville Road   8456142839  
##  4       4 David      Rodriguez  Male        1121 Willow Greene Dr… 3343021874  
##  5       5 Elizabeth  Johnson    Female      3755 Trouser Leg Road  4139942499  
##  6       6 Jessica    Addison    Female      2400 Caynor Circle     9084966654  
##  7       7 Emily      Madison    Female      1313 Bryan Street      3362097310  
##  8       8 Jacob      Tyler      Male        2269 Oak Way           4024521227  
##  9       9 Jose       Alexander  Male        4461 Windy Ridge Road  2604874772  
## 10      10 Ava        Isabella   Female      1438 Camden Street     7752547641  
## # … with 1 more variable: agentEmail <chr>
# Company Table
company_data <- tbl(CFA_db, "Company")
company_df = collect(company_data)
company_df
## # A tibble: 8 × 3
##   companyId companyName        companyType            
##       <int> <chr>              <chr>                  
## 1         1 Walmart            Retail                 
## 2         2 Amazon             Retail                 
## 3         3 Apple Inc.         Electronics industry   
## 4         4 CVS Health         Healthcare             
## 5         5 UnitedHealth Group Healthcare             
## 6         6 AmerisourceBergen  Pharmaceutical industry
## 7         7 Microsoft          Technology             
## 8         8 JPMorgan Chase     Financial services
# Customer Table
customer_data <- tbl(CFA_db, "Customer")
customer_df = collect(customer_data)
customer_df
## # A tibble: 10 × 8
##    customerId customerFName customerLName customerGender customerAddreess    
##         <int> <chr>         <chr>         <chr>          <chr>               
##  1          1 Luis          Carlos        Male           4887 Perine Street  
##  2          2 Joseph        Carter        Male           3153 Elmwood Avenue 
##  3          3 Daniel        Elijah        Male           2991 Lena Lane      
##  4          4 Ryan          Jackson       Male           1716 Francis Mine   
##  5          5 Olivia        Abigail       Female         3301 Orchard Street 
##  6          6 Sophia        Mia           Female         1636 Hudson Street  
##  7          7 Natalie       Grace         Female         1751 Florence Street
##  8          8 Diego         Benjamin      Male           4284 Parkway Drive  
##  9          9 Kevin         Logan         Male           4828 Perry Street   
## 10         10 Paola         Genesis       Female         3321 Mulberry Street
## # … with 3 more variables: customerContact <int64>, customerEmail <chr>,
## #   customerJob <chr>
# Investment Table
investment_data <- tbl(CFA_db, "Investment")
investment_df = collect(investment_data)
investment_df
## # A tibble: 15 × 8
##    investmentId customerId companyId agentId Date        unit unitPrice Amount
##           <int>      <int>     <int>   <int> <chr>      <int>     <int>  <int>
##  1            1          2         3       1 12-01-2020    12       125   1500
##  2            2          1         4       3 23-02-2020    15        90   1350
##  3            3          4         7       2 23-02-2020    25       110   2750
##  4            4          3         5       4 25-02-2020    18        96   1728
##  5            5          6         6       7 10-03-2020    30        75   2250
##  6            6          5         2       5 17-03-2020    10       135   1350
##  7            7          7         3       6 15-03-2020    16       125   2000
##  8            8          8         8       9 20-03-2020    20       105   2100
##  9            9         10         1       8 20-03-2020    18       100   1800
## 10           10          9         2      10 20-03-2021    15       135   2025
## 11           11          3         4       4 26-03-2020    25        90   2250
## 12           12          6         4       3 28-03-2020    20        90   1800
## 13           13          4         3       1 04-04-2020    22       125   2750
## 14           14          2         7       2 06-04-2020    28       110   3080
## 15           15          7         7       2 10-04-2020    30       110   3300
# Portfolio Table
portfolio_data <- tbl(CFA_db, "Portfolio")
portfolio_df = collect(portfolio_data)
portfolio_df
## # A tibble: 15 × 5
##    portfolioId customerId investmentId portfolioName portfolioStatus
##          <int>      <int>        <int> <chr>         <chr>          
##  1           1          2            1 Stock         Received       
##  2           2          1            2 Stock         Received       
##  3           3          4            3 Bond          Received       
##  4           4          3            4 Bond          Received       
##  5           5          6            5 Bond          Processing     
##  6           6          5            6 Stock         Received       
##  7           7          7            7 Stock         Cancelled      
##  8           8          8            8 Certificate   Processing     
##  9           9         10            9 Bond          Received       
## 10          10          9           10 Bond          Received       
## 11          11          3           11 Stock         Processing     
## 12          12          6           12 Stock         Processing     
## 13          13          4           13 Bond          Cancelled      
## 14          14          2           14 Stock         Received       
## 15          15          7           15 Stock         Processing
# Security Table
security_data <- tbl(CFA_db, "Security")
security_df = collect(security_data)
security_df
## # A tibble: 15 × 4
##    securityId investmentId securityType securityAmount
##         <int>        <int> <chr>                 <int>
##  1          1            1 Debt                    180
##  2          2            2 Equity                  203
##  3          3            3 Derivative              495
##  4          4            4 Equity                  260
##  5          5            5 Debt                    270
##  6          6            6 Debt                    162
##  7          7            7 Debt                    240
##  8          8            8 Derivative              378
##  9          9            9 Equity                  270
## 10         10           10 Equity                  304
## 11         11           11 Equity                  338
## 12         12           12 Equity                  270
## 13         13           13 Debt                    330
## 14         14           14 Derivative              555
## 15         15           15 Derivative              594


Query 1

0.0.1 Find the customer names and their investment details.

inv_Customers <- sqldf("SELECT customerFName, customerLName, companyName, companyType, Amount
                       FROM customer_df cu, company_df co, investment_df inv
                       WHERE cu.customerId = inv.customerId AND
                       co.companyId = inv.companyId", method = "raw")
inv_Customers
##    customerFName customerLName        companyName             companyType
## 1           Luis        Carlos         CVS Health              Healthcare
## 2         Joseph        Carter         Apple Inc.    Electronics industry
## 3         Joseph        Carter          Microsoft              Technology
## 4         Daniel        Elijah         CVS Health              Healthcare
## 5         Daniel        Elijah UnitedHealth Group              Healthcare
## 6           Ryan       Jackson         Apple Inc.    Electronics industry
## 7           Ryan       Jackson          Microsoft              Technology
## 8         Olivia       Abigail             Amazon                  Retail
## 9         Sophia           Mia         CVS Health              Healthcare
## 10        Sophia           Mia  AmerisourceBergen Pharmaceutical industry
## 11       Natalie         Grace         Apple Inc.    Electronics industry
## 12       Natalie         Grace          Microsoft              Technology
## 13         Diego      Benjamin     JPMorgan Chase      Financial services
## 14         Kevin         Logan             Amazon                  Retail
## 15         Paola       Genesis            Walmart                  Retail
##    Amount
## 1    1350
## 2    1500
## 3    3080
## 4    2250
## 5    1728
## 6    2750
## 7    2750
## 8    1350
## 9    1800
## 10   2250
## 11   2000
## 12   3300
## 13   2100
## 14   2025
## 15   1800

0.0.2 Visualization.

ggplot(inv_Customers, aes(fill=companyType, x=customerFName, y=Amount)) + 
  geom_bar(position="stack", stat="identity") +
  scale_fill_viridis(discrete = T) +
  xlab("Customer Name") +
  ylab("Invested Amount") +
  ggtitle("Investment Details")

Query 2

0.0.3 Find the customer names and portfolio details for received portfolio status.

pf_Rec_Customers <- sqldf("SELECT customerFName, customerLName, portfolioName, portfolioStatus
                       FROM customer_df c, portfolio_df p
                       WHERE c.customerId = p.customerId AND
                       p.portfolioStatus = 'Received'", method = "raw")
pf_Rec_Customers
##   customerFName customerLName portfolioName portfolioStatus
## 1        Joseph        Carter         Stock        Received
## 2          Luis        Carlos         Stock        Received
## 3          Ryan       Jackson          Bond        Received
## 4        Daniel        Elijah          Bond        Received
## 5        Olivia       Abigail         Stock        Received
## 6         Paola       Genesis          Bond        Received
## 7         Kevin         Logan          Bond        Received
## 8        Joseph        Carter         Stock        Received

Query 3

0.0.4 Find the customer names and their investment details who have invested in Healthcare industry.

inv_Hc_Customers <- sqldf("SELECT customerFName, customerLName, companyName, companyType, Amount
                       FROM customer_df cu, company_df co, investment_df inv
                       WHERE cu.customerId = inv.customerId AND
                       co.companyId = inv.companyId AND
                       co.companyType = 'Healthcare'", method = "raw")
inv_Hc_Customers
##   customerFName customerLName        companyName companyType Amount
## 1          Luis        Carlos         CVS Health  Healthcare   1350
## 2        Daniel        Elijah         CVS Health  Healthcare   2250
## 3        Sophia           Mia         CVS Health  Healthcare   1800
## 4        Daniel        Elijah UnitedHealth Group  Healthcare   1728

0.0.5 Visualization.

ggplot(inv_Hc_Customers, aes(y=Amount, x=customerFName)) + 
  geom_boxplot(
    color="blue",
    fill="blue",
    alpha=0.2,
    notch=FALSE,
    notchwidth = 0.8,
    outlier.colour="red",
    outlier.fill="red",
    outlier.size=3
  ) +
  xlab("Customer Name") +
  ylab("Invested Amount") +
  ggtitle("Healthcare Investment Details")

Query 4

0.0.6 Find the customer names and their investment details who have invested higher than the average invested amount.

avg_Amt_Customers <- sqldf("SELECT customerFName, customerLName, companyName, companyType, Amount
                           FROM customer_df cu, company_df co, investment_df inv
                           WHERE cu.customerId = inv.customerId AND
                           co.companyId = inv.companyId AND
                           amount > (SELECT AVG(amount) FROM investment_df)", method = "raw")
avg_Amt_Customers
##   customerFName customerLName       companyName             companyType Amount
## 1          Ryan       Jackson         Microsoft              Technology   2750
## 2        Sophia           Mia AmerisourceBergen Pharmaceutical industry   2250
## 3        Daniel        Elijah        CVS Health              Healthcare   2250
## 4          Ryan       Jackson        Apple Inc.    Electronics industry   2750
## 5        Joseph        Carter         Microsoft              Technology   3080
## 6       Natalie         Grace         Microsoft              Technology   3300

Query 5

0.0.7 Find the unit price of the products offered by each of the companies.

up_Type_Companies <- sqldf("SELECT companyName, companyType, unitPrice
                         FROM company_df co, investment_df inv
                         WHERE co.companyId = inv.companyId
                         GROUP BY companyName", method = "raw")
up_Type_Companies
##          companyName             companyType unitPrice
## 1             Amazon                  Retail       135
## 2  AmerisourceBergen Pharmaceutical industry        75
## 3         Apple Inc.    Electronics industry       125
## 4         CVS Health              Healthcare        90
## 5     JPMorgan Chase      Financial services       105
## 6          Microsoft              Technology       110
## 7 UnitedHealth Group              Healthcare        96
## 8            Walmart                  Retail       100

0.0.8 Visualization.

ggplot(data = up_Type_Companies, mapping = aes(fill=companyName, x = companyName, y = unitPrice)) +
  geom_bar(stat = "identity") +
  xlab("Company Name") +
  theme(axis.text.x = element_text(angle = 45, vjust = 1, hjust=1)) +
  ylab("Unit Price") +
  ggtitle("Unit Price Analysis")

Query 6

0.0.9 Find the total investment amount for each type of companies.

inv_Type_Companies <- sqldf("SELECT companyType, sum(amount) as investedAmount
                         FROM company_df co, investment_df inv
                         WHERE co.companyId = inv.companyId
                         GROUP BY companyType", method = "raw")
inv_Type_Companies
##               companyType investedAmount
## 1    Electronics industry           6250
## 2      Financial services           2100
## 3              Healthcare           7128
## 4 Pharmaceutical industry           2250
## 5                  Retail           5175
## 6              Technology           9130

0.0.10 Visualization.

ggplot(data = inv_Type_Companies, mapping = aes(fill=companyType, x = companyType, y = investedAmount)) +
  geom_bar(stat = "identity") +
  xlab("Company Type") +
  theme(axis.text.x = element_text(angle = 45, vjust = 1, hjust=1)) +
  ylab("Invested Amount") +
  ggtitle("Investment Analysis")