SQL (Structured Query Language) is the most common language for extracting data from databases. This lab will introduce basic sytax on which you can build in the future. Rather than downloading new software (such as SQL Server, MySQL) and typing SQL commands in that software, we will make a connection to a SQL database within R and query the database from within R. We need two packages DBI and RSQLite. You should also download file finance and put it in your working directory.
library(DBI)
library(RSQLite)
## Warning: package 'RSQLite' was built under R version 3.3.3
con <- dbConnect(SQLite(), "C:/Users/dvorakt/Google Drive/finance")
#con <- dbConnect(SQLite(), "finance")
We now have a connection to a database called finance. The database has two tables: stocks and tickers, each with lots of different variables.
dbListTables(con)
## [1] "stocks" "tickers"
dbListFields(con, "stocks")
## [1] "ticker" "dimension" "calendardate" "datekey"
## [5] "reportperiod" "accoci" "assets" "assetsavg"
## [9] "assetsc" "assetsnc" "assetturnover" "bvps"
## [13] "capex" "cashneq" "cashnequsd" "cor"
## [17] "consolinc" "currentratio" "de" "debt"
## [21] "debtc" "debtnc" "debtusd" "deferredrev"
## [25] "depamor" "deposits" "divyield" "dps"
## [29] "ebit" "ebitda" "ebitdamargin" "ebitdausd"
## [33] "ebitusd" "ebt" "eps" "epsdil"
## [37] "epsusd" "equity" "equityavg" "equityusd"
## [41] "ev" "evebit" "evebitda" "fcf"
## [45] "fcfps" "fxusd" "gp" "grossmargin"
## [49] "intangibles" "intexp" "invcap" "invcapavg"
## [53] "inventory" "investments" "investmentsc" "investmentsnc"
## [57] "liabilities" "liabilitiesc" "liabilitiesnc" "marketcap"
## [61] "ncf" "ncfbus" "ncfcommon" "ncfdebt"
## [65] "ncfdiv" "ncff" "ncfi" "ncfinv"
## [69] "ncfo" "ncfx" "netinc" "netinccmn"
## [73] "netinccmnusd" "netincdis" "netincnci" "netmargin"
## [77] "opex" "opinc" "payables" "payoutratio"
## [81] "pb" "pe" "pe1" "ppnenet"
## [85] "prefdivis" "price" "ps" "ps1"
## [89] "receivables" "retearn" "revenue" "revenueusd"
## [93] "rnd" "roa" "roe" "roic"
## [97] "ros" "sbcomp" "sgna" "sharefactor"
## [101] "sharesbas" "shareswa" "shareswadil" "sps"
## [105] "tangibles" "taxassets" "taxexp" "taxliabilities"
## [109] "tbvps" "workingcapital"
dbListFields(con, "tickers")
## [1] "Ticker" "Name" "CUSIP"
## [4] "Fama.Industry" "Currency" "Sector"
## [7] "Industry" "Last.Updated" "Prior.Tickers"
## [10] "Ticker.Change.Date" "Related.Tickers" "Exchange"
## [13] "SIC" "Perma.Ticker" "Location"
## [16] "Delisted.From" "Is.Foreign"
We’ll type the query as if we were in a database software and then pass that query to the database using the dbGetQuery functions. The SELECT statement is used to select data from a database.
query1 <- "SELECT datekey, ticker, price, pe FROM stocks"
test <- dbGetQuery(con, query1)
head(test)
## datekey ticker price pe
## 1 12627 APD 49.80 NA
## 2 12765 APD 57.84 21.764
## 3 12822 APD 59.91 NA
## 4 12900 APD 56.07 19.096
## 5 12998 APD 61.23 19.370
## 6 13109 APD 60.43 18.859
Clause WHERE is similar to filter and allows us to specify which rows we want to select.
query2 <- "SELECT datekey, ticker, price, pe FROM stocks WHERE ticker='AAPL'"
test <- dbGetQuery(con, query2)
head(test)
## datekey ticker price pe
## 1 12093 AAPL 1.025 272.256
## 2 12185 AAPL 1.334 NA
## 3 12276 AAPL 1.407 NA
## 4 12405 AAPL 1.407 104.991
## 5 12458 AAPL 1.641 60.689
## 6 12544 AAPL 1.899 58.730
query3 <- "SELECT stocks.ticker, datekey, price, pe, sector, name FROM stocks INNER JOIN tickers ON stocks.ticker = tickers.Ticker"
test <- dbGetQuery(con, query3)
head(test)
## ticker datekey price pe Sector Name
## 1 APD 12627 49.80 NA Basic Materials AIR PRODUCTS & CHEMICALS INC
## 2 APD 12765 57.84 21.764 Basic Materials AIR PRODUCTS & CHEMICALS INC
## 3 APD 12822 59.91 NA Basic Materials AIR PRODUCTS & CHEMICALS INC
## 4 APD 12900 56.07 19.096 Basic Materials AIR PRODUCTS & CHEMICALS INC
## 5 APD 12998 61.23 19.370 Basic Materials AIR PRODUCTS & CHEMICALS INC
## 6 APD 13109 60.43 18.859 Basic Materials AIR PRODUCTS & CHEMICALS INC
query4 <- "CREATE TABLE stocks2 AS SELECT stocks.ticker, datekey, price, pe, roa, sector, name FROM stocks INNER JOIN tickers ON stocks.ticker = tickers.Ticker"
dbExecute(con, query4)
## [1] 0
dbListTables(con)
## [1] "stocks" "stocks2" "tickers"
dbListFields(con, "stocks2")
## [1] "ticker" "datekey" "price" "pe" "roa" "Sector" "Name"
query5 <- "SELECT sector , AVG(pe) FROM stocks2 GROUP BY sector"
test <- dbGetQuery(con, query5)
test
## Sector AVG(pe)
## 1 Basic Materials 50.20280
## 2 Conglomerates 43.09143
## 3 Consumer Goods 37.02569
## 4 Diversified Machinery NA
## 5 Financial 42.76497
## 6 Healthcare 67.96055
## 7 Industrial Goods 42.55980
## 8 None 37.70597
## 9 Services 66.88780
## 10 Technology 90.93233
## 11 Utilities 22.12893
Can you do the same thing in R?
library(dplyr)
stocks2 <- dbGetQuery(con, "SELECT * FROM stocks2")
bysector <- stocks2 %>% group_by(Sector) %>% summarize(pe=mean(pe, na.rm=TRUE))
library(lubridate)
##
## Attaching package: 'lubridate'
## The following object is masked from 'package:base':
##
## date
library(ggplot2)
stocks2 <- stocks2 %>% group_by(ticker) %>% mutate(n=n()) %>% filter(n>30)
stocks2$date <- as.Date(stocks2$datekey, origin = "1970-01-01")
stocks2$year <- year(stocks2$date)
stocks2$q <- quarter(stocks2$date)
stocks2 <- stocks2 %>% group_by(year, q) %>% mutate(decile_pe=ntile(pe,10)) %>% arrange(year, q, decile_pe)
first <- filter(stocks2, year==2003 & q==1)
first <- first %>% ungroup() %>%
rename(price_first=price, pe_first=pe, decile_first=decile_pe) %>%
select(ticker, price_first, decile_first)
stocks2 <- inner_join(stocks2,first, by="ticker")
stocks2$price_norm <- stocks2$price/stocks2$price_first*100
stocks2$date <- ydm(paste(stocks2$year,1,stocks2$q*3))
bydecile <- stocks2 %>% group_by(date, decile_first) %>% summarize(price_norm=mean(price_norm))
ggplot(bydecile, aes(x=date,y=price_norm,color=as.factor(decile_first))) + geom_line()
#d <- filter(stocks2, (year==2003 | year==2006 | year==2009 |year==2012 | year==2015) & q==1)
#d <- arrange(d, ticker, date)
#d$ret <- (1+log(d$price)-log(lag(d$price)))^(1/3)-1
#d$ret <- ifelse(ticker!=lag(ticker),NA,d$ret)