Lab 17: Working with financial data and SQL

Learning Objectives:

  • learn basic syntax of SQL

1. Make a connection to SQL database

SQL (Structured Query Language) is the most common language for extracting data from databases. This lab will introduce basic sytax on which you can build in the future. Rather than downloading new software (such as SQL Server, MySQL) and typing SQL commands in that software, we will make a connection to a SQL database within R and query the database from within R. We need two packages DBI and RSQLite. You should also download file finance and put it in your working directory.

library(DBI)
library(RSQLite)
## Warning: package 'RSQLite' was built under R version 3.3.3
con <- dbConnect(SQLite(), "C:/Users/dvorakt/Google Drive/finance")
#con <- dbConnect(SQLite(), "finance")

We now have a connection to a database called finance. The database has two tables: stocks and tickers, each with lots of different variables.

dbListTables(con)
## [1] "stocks"  "tickers"
dbListFields(con, "stocks")
##   [1] "ticker"         "dimension"      "calendardate"   "datekey"       
##   [5] "reportperiod"   "accoci"         "assets"         "assetsavg"     
##   [9] "assetsc"        "assetsnc"       "assetturnover"  "bvps"          
##  [13] "capex"          "cashneq"        "cashnequsd"     "cor"           
##  [17] "consolinc"      "currentratio"   "de"             "debt"          
##  [21] "debtc"          "debtnc"         "debtusd"        "deferredrev"   
##  [25] "depamor"        "deposits"       "divyield"       "dps"           
##  [29] "ebit"           "ebitda"         "ebitdamargin"   "ebitdausd"     
##  [33] "ebitusd"        "ebt"            "eps"            "epsdil"        
##  [37] "epsusd"         "equity"         "equityavg"      "equityusd"     
##  [41] "ev"             "evebit"         "evebitda"       "fcf"           
##  [45] "fcfps"          "fxusd"          "gp"             "grossmargin"   
##  [49] "intangibles"    "intexp"         "invcap"         "invcapavg"     
##  [53] "inventory"      "investments"    "investmentsc"   "investmentsnc" 
##  [57] "liabilities"    "liabilitiesc"   "liabilitiesnc"  "marketcap"     
##  [61] "ncf"            "ncfbus"         "ncfcommon"      "ncfdebt"       
##  [65] "ncfdiv"         "ncff"           "ncfi"           "ncfinv"        
##  [69] "ncfo"           "ncfx"           "netinc"         "netinccmn"     
##  [73] "netinccmnusd"   "netincdis"      "netincnci"      "netmargin"     
##  [77] "opex"           "opinc"          "payables"       "payoutratio"   
##  [81] "pb"             "pe"             "pe1"            "ppnenet"       
##  [85] "prefdivis"      "price"          "ps"             "ps1"           
##  [89] "receivables"    "retearn"        "revenue"        "revenueusd"    
##  [93] "rnd"            "roa"            "roe"            "roic"          
##  [97] "ros"            "sbcomp"         "sgna"           "sharefactor"   
## [101] "sharesbas"      "shareswa"       "shareswadil"    "sps"           
## [105] "tangibles"      "taxassets"      "taxexp"         "taxliabilities"
## [109] "tbvps"          "workingcapital"
dbListFields(con, "tickers")
##  [1] "Ticker"             "Name"               "CUSIP"             
##  [4] "Fama.Industry"      "Currency"           "Sector"            
##  [7] "Industry"           "Last.Updated"       "Prior.Tickers"     
## [10] "Ticker.Change.Date" "Related.Tickers"    "Exchange"          
## [13] "SIC"                "Perma.Ticker"       "Location"          
## [16] "Delisted.From"      "Is.Foreign"

2. SELECT and WHERE statements

We’ll type the query as if we were in a database software and then pass that query to the database using the dbGetQuery functions. The SELECT statement is used to select data from a database.

query1 <- "SELECT datekey, ticker, price, pe FROM stocks"
test <- dbGetQuery(con, query1)
head(test)
##   datekey ticker price     pe
## 1   12627    APD 49.80     NA
## 2   12765    APD 57.84 21.764
## 3   12822    APD 59.91     NA
## 4   12900    APD 56.07 19.096
## 5   12998    APD 61.23 19.370
## 6   13109    APD 60.43 18.859

Clause WHERE is similar to filter and allows us to specify which rows we want to select.

query2 <- "SELECT datekey, ticker, price, pe FROM stocks WHERE ticker='AAPL'"
test <- dbGetQuery(con, query2)
head(test)
##   datekey ticker price      pe
## 1   12093   AAPL 1.025 272.256
## 2   12185   AAPL 1.334      NA
## 3   12276   AAPL 1.407      NA
## 4   12405   AAPL 1.407 104.991
## 5   12458   AAPL 1.641  60.689
## 6   12544   AAPL 1.899  58.730

3. INNER JOIN statement

query3 <- "SELECT stocks.ticker, datekey, price, pe, sector, name FROM stocks INNER JOIN tickers ON stocks.ticker =  tickers.Ticker"
test <- dbGetQuery(con, query3)
head(test)
##   ticker datekey price     pe          Sector                         Name
## 1    APD   12627 49.80     NA Basic Materials AIR PRODUCTS & CHEMICALS INC
## 2    APD   12765 57.84 21.764 Basic Materials AIR PRODUCTS & CHEMICALS INC
## 3    APD   12822 59.91     NA Basic Materials AIR PRODUCTS & CHEMICALS INC
## 4    APD   12900 56.07 19.096 Basic Materials AIR PRODUCTS & CHEMICALS INC
## 5    APD   12998 61.23 19.370 Basic Materials AIR PRODUCTS & CHEMICALS INC
## 6    APD   13109 60.43 18.859 Basic Materials AIR PRODUCTS & CHEMICALS INC

4. CREATE TABLE statement

query4 <- "CREATE TABLE stocks2 AS SELECT stocks.ticker, datekey, price, pe, roa, sector, name FROM stocks INNER JOIN tickers ON stocks.ticker =  tickers.Ticker"
dbExecute(con, query4)
## [1] 0
dbListTables(con)
## [1] "stocks"  "stocks2" "tickers"
dbListFields(con, "stocks2")
## [1] "ticker"  "datekey" "price"   "pe"      "roa"     "Sector"  "Name"

5. GROUP BY statement

query5 <- "SELECT sector , AVG(pe) FROM stocks2 GROUP BY sector"  
test <- dbGetQuery(con, query5)
test
##                   Sector  AVG(pe)
## 1        Basic Materials 50.20280
## 2          Conglomerates 43.09143
## 3         Consumer Goods 37.02569
## 4  Diversified Machinery       NA
## 5              Financial 42.76497
## 6             Healthcare 67.96055
## 7       Industrial Goods 42.55980
## 8                   None 37.70597
## 9               Services 66.88780
## 10            Technology 90.93233
## 11             Utilities 22.12893

Can you do the same thing in R?

library(dplyr)
stocks2 <- dbGetQuery(con, "SELECT * FROM stocks2")
bysector <- stocks2 %>% group_by(Sector) %>% summarize(pe=mean(pe, na.rm=TRUE))

6. Explore predictive power of P/E

library(lubridate)
## 
## Attaching package: 'lubridate'
## The following object is masked from 'package:base':
## 
##     date
library(ggplot2)
stocks2 <- stocks2 %>% group_by(ticker) %>% mutate(n=n()) %>% filter(n>30)
stocks2$date <- as.Date(stocks2$datekey, origin = "1970-01-01")
stocks2$year <- year(stocks2$date)
stocks2$q <- quarter(stocks2$date)
stocks2 <- stocks2 %>% group_by(year, q) %>% mutate(decile_pe=ntile(pe,10)) %>% arrange(year, q, decile_pe)

first <- filter(stocks2, year==2003 & q==1)
first <- first %>% ungroup() %>% 
  rename(price_first=price, pe_first=pe, decile_first=decile_pe) %>% 
  select(ticker, price_first, decile_first)

stocks2 <- inner_join(stocks2,first, by="ticker")
stocks2$price_norm <- stocks2$price/stocks2$price_first*100
stocks2$date <- ydm(paste(stocks2$year,1,stocks2$q*3))
bydecile <- stocks2 %>% group_by(date, decile_first) %>% summarize(price_norm=mean(price_norm))
ggplot(bydecile, aes(x=date,y=price_norm,color=as.factor(decile_first))) + geom_line() 

#d <- filter(stocks2, (year==2003 | year==2006 | year==2009 |year==2012 | year==2015) & q==1)
#d <- arrange(d, ticker, date)
#d$ret <- (1+log(d$price)-log(lag(d$price)))^(1/3)-1
#d$ret <- ifelse(ticker!=lag(ticker),NA,d$ret)