Hypothesis 1 - Regression Analysis

Collecting Data

library(RJSONIO)

# Pull the most recent transactions from the Bitstamp public API
url <- "https://www.bitstamp.net/api/transactions/"
bs_data <- fromJSON(url)  # returns a list of transactions
bs_df <- do.call(rbind, lapply(bs_data, data.frame, stringsAsFactors = FALSE))
head(bs_df)
##         date      tid   price type     amount
## 1 1529634069 68796428 6657.95    0 4.02163929
## 2 1529634052 68796423 6657.95    0 0.03417848
## 3 1529634042 68796418 6657.95    0 0.00108503
## 4 1529634041 68796416 6655.66    0 0.01000000
## 5 1529634040 68796415 6655.55    0 1.00000000
## 6 1529634040 68796414 6655.00    0 2.52216676

Summary of the data

summary(bs_df)
##      date                tid              price                type     
##  Length:100         Min.   :68796028   Length:100         Min.   :0.00  
##  Class :character   1st Qu.:68796134   Class :character   1st Qu.:0.00  
##  Mode  :character   Median :68796286   Mode  :character   Median :0.00  
##                     Mean   :68796256                      Mean   :0.43  
##                     3rd Qu.:68796373                      3rd Qu.:1.00  
##                     Max.   :68796428                      Max.   :1.00  
##     amount         
##  Length:100        
##  Class :character  
##  Mode  :character  
##                    
##                    
## 

Imputing missing values with the mean

library(plyr)
bs_df2 <- ldply(bs_data, data.frame)

# Convert NULL list elements to NA so they can be imputed
nullToNA <- function(x) {
  x[sapply(x, is.null)] <- NA
  return(x)
}
head(bs_df2)
##         date      tid   price type     amount
## 1 1529634069 68796428 6657.95    0 4.02163929
## 2 1529634052 68796423 6657.95    0 0.03417848
## 3 1529634042 68796418 6657.95    0 0.00108503
## 4 1529634041 68796416 6655.66    0 0.01000000
## 5 1529634040 68796415 6655.55    0 1.00000000
## 6 1529634040 68796414 6655.00    0 2.52216676
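
The nullToNA() helper above only converts NULL entries to NA; the mean substitution itself is not shown. A minimal sketch of that step, assuming the target columns are numeric (impute_mean is a hypothetical helper, not part of the original analysis):

# Hypothetical helper: replace NAs in a numeric vector with the column mean
impute_mean <- function(x) {
  x[is.na(x)] <- mean(x, na.rm = TRUE)
  x
}

bs_df2 <- nullToNA(bs_df2)                # NULL -> NA first
num_cols <- sapply(bs_df2, is.numeric)    # impute numeric columns only
bs_df2[num_cols] <- lapply(bs_df2[num_cols], impute_mean)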

Converting variables to factors

bs_df2$type <- factor(bs_df2$type)
is.factor(bs_df2$type)
## [1] TRUE

Converting character variables to numeric variables

bs_df2$price <- as.numeric(bs_df2$price)
bs_df2$amount <- as.numeric(bs_df2$amount)
summary(bs_df2)
##          date         tid               price       type       amount     
##  1529634040: 5   Min.   :68796028   Min.   : 1.00   0:57   Min.   : 1.00  
##  1529633667: 4   1st Qu.:68796134   1st Qu.: 7.00   1:43   1st Qu.:20.75  
##  1529633967: 3   Median :68796286   Median :23.50          Median :45.50  
##  1529633913: 3   Mean   :68796256   Mean   :25.45          Mean   :44.12  
##  1529633836: 3   3rd Qu.:68796373   3rd Qu.:41.25          3rd Qu.:66.25  
##  1529633749: 3   Max.   :68796428   Max.   :59.00          Max.   :91.00  
##  (Other)   :79
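
Note the pitfall visible in this summary: price and amount were read in as factors, and as.numeric() applied directly to a factor returns the underlying level codes, not the original values, which is why price now ranges from 1 to 59 instead of around 6657. The safe idiom converts through character first, as this small illustration shows:

p <- factor(c("6657.95", "6655.66", "6655.55"))
as.numeric(p)                # 3 2 1 -- the level codes
as.numeric(as.character(p))  # 6657.95 6655.66 6655.55 -- the actual values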

Relationship between price and amount

plot(amount ~ price, data = bs_df2, main = "Relationship between price and amount")

Standard deviation

sapply(bs_df2, sd)
## Warning in var(if (is.vector(x) || is.factor(x)) x else as.double(x), na.rm = na.rm): Calling var(x) on a factor x is deprecated and will become an error.
##   Use something like 'all(duplicated(x)[-1L])' to test for a constant vector.
## (the warning is repeated once for each factor column)
##        date         tid       price        type      amount 
##  20.0276854 127.6006535  18.3288562   0.4975699  26.8863226
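
The deprecation warnings come from calling sd() on the factor columns. One way to avoid them is to restrict the calculation to the numeric columns, for example:

sapply(Filter(is.numeric, bs_df2), sd, na.rm = TRUE)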

Running the regression

fit <- glm(type ~ amount + price, data = bs_df2, family = binomial(link = "logit"))
summary(fit)
## 
## Call:
## glm(formula = type ~ amount + price, family = binomial(link = "logit"), 
##     data = bs_df2)
## 
## Deviance Residuals: 
##    Min      1Q  Median      3Q     Max  
## -1.170  -1.063  -1.031   1.289   1.654  
## 
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)
## (Intercept) -0.32415    0.39817  -0.814    0.416
## amount       0.01185    0.01910   0.621    0.535
## price       -0.01896    0.02795  -0.678    0.498
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 136.66  on 99  degrees of freedom
## Residual deviance: 136.18  on 97  degrees of freedom
## AIC: 142.18
## 
## Number of Fisher Scoring iterations: 4
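
For interpretation, the logit coefficients can be converted to odds ratios with Wald confidence intervals; a minimal sketch (confint.default() gives plain Wald intervals):

# Odds ratios with 95% Wald confidence intervals
exp(cbind(OR = coef(fit), confint.default(fit)))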

None of the independent variables is statistically significant, so we fail to reject the null hypothesis that the amount and price fields do not influence the purchase of bitcoin.

Hypothesis 2 - Survey and Logit Regression

getwd()
## [1] "C:/Users/SurbhiK/Documents/ANLY699"
survey <- read.csv("Survey1.csv", header = TRUE)
head(survey)
##   AgeGroup Gender Income Race TradedCrypto NameCurrency Profitability
## 1        0      0      3    0            1     Ethereum             0
## 2        0      0      3    0            1       Ripple             0
## 3        1      0      4    2            1     Ethereum             0
## 4        1      0      3    0            1     Ethereum             0
## 5        1      1      3    0            1     Litecoin             0
## 6        0      1      3    0            1     Ethereum             0
# Frequencies of each currency traded, taken directly from the data
barplot(sort(table(survey$NameCurrency), decreasing = TRUE),
        main = "Bar Chart for Types of CryptoCurrencies")

survey$AgeGroup <- as.factor(survey$AgeGroup)
survey$Gender <- as.factor(survey$Gender)
survey$Income <- as.factor(survey$Income)
survey$Race <- as.factor(survey$Race)
summary(survey)
##  AgeGroup Gender Income Race   TradedCrypto   NameCurrency Profitability
##  0:7      0:6    3:6    0:9   Min.   :1     Bitcoin :1     Min.   :0.0  
##  1:3      1:4    4:4    2:1   1st Qu.:1     Ethereum:4     1st Qu.:0.0  
##                               Median :1     Litecoin:3     Median :0.0  
##                               Mean   :1     Ripple  :2     Mean   :0.2  
##                               3rd Qu.:1                    3rd Qu.:0.0  
##                               Max.   :1                    Max.   :1.0
sapply(survey, sd)
## Warning in var(if (is.vector(x) || is.factor(x)) x else as.double(x), na.rm = na.rm): Calling var(x) on a factor x is deprecated and will become an error.
##   Use something like 'all(duplicated(x)[-1L])' to test for a constant vector.
## (the warning is repeated once for each factor column)
##      AgeGroup        Gender        Income          Race  TradedCrypto 
##     0.4830459     0.5163978     0.5163978     0.3162278     0.0000000 
##  NameCurrency Profitability 
##     0.9660918     0.4216370
fit_survey <- glm(Profitability ~ AgeGroup + Gender + Income + Race, data = survey, family = binomial(link = "logit"))
summary(fit_survey)
## 
## Call:
## glm(formula = Profitability ~ AgeGroup + Gender + Income + Race, 
##     family = binomial(link = "logit"), data = survey)
## 
## Deviance Residuals: 
##        1         2         3         4         5         6         7  
## -0.73395  -0.73395  -0.00008  -0.00007  -0.00009  -0.83043  -0.83043  
##        8         9        10  
##  1.69907   1.44080  -0.93513  
## 
## Coefficients:
##               Estimate Std. Error z value Pr(>|z|)
## (Intercept)    -1.1741     1.2645  -0.928    0.353
## AgeGroup1     -18.5412  7587.8338  -0.002    0.998
## Gender1         0.2867     1.8511   0.155    0.877
## Income4         0.2867     1.8511   0.155    0.877
## Race2          -0.1374 13161.4597   0.000    1.000
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 10.0080  on 9  degrees of freedom
## Residual deviance:  8.2938  on 5  degrees of freedom
## AIC: 18.294
## 
## Number of Fisher Scoring iterations: 18

None of the independent variables is statistically significant, so we fail to reject the null hypothesis that demographics do not affect profitability in the cryptocurrency market.
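
Beyond the lack of significance, the very large standard errors for AgeGroup1 and Race2 together with the 18 Fisher scoring iterations suggest quasi-complete separation in this small sample (n = 10), so the coefficient estimates themselves should not be trusted. A quick cross-tabulation makes separation visible; if confirmed, a penalized fit such as Firth logistic regression (e.g., the logistf package) would be a reasonable alternative:

# Cross-tabulate each suspect predictor against the outcome;
# a zero cell indicates (quasi-)separation
table(survey$AgeGroup, survey$Profitability)
table(survey$Race, survey$Profitability)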

Hypothesis 3 - Time Series

# Full Bitstamp USD trade history published by bitcoincharts
bitcoin_file <- "bitstampUSD.csv.gz"
URL <- "http://api.bitcoincharts.com/v1/csv"
source_file <- file.path(URL, bitcoin_file)

Data destination on local disk

getwd()
## [1] "C:/Users/SurbhiK/Documents/ANLY699"
dataDir <- "C:/Users/SurbhiK/Documents"
dest_file <- file.path(dataDir,bitcoin_file)

Download to disk

download.file(source_file,destfile = dest_file)
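
Since the full trade history is large, a guard against re-downloading on every run may be worthwhile (a small convenience sketch):

# Only download if the file is not already on disk
if (!file.exists(dest_file)) {
  download.file(source_file, destfile = dest_file)
}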

Read the compressed .gz file

raw <- read.csv(gzfile(dest_file), header = FALSE)
head(raw,2)
##           V1   V2 V3
## 1 1315922016 5.80  1
## 2 1315922024 5.83  3
names(raw) <- c("unixtime","price","amount")
raw$date <- as.Date(as.POSIXct(raw$unixtime, origin="1970-01-01"))
head(raw,2)
##     unixtime price amount       date
## 1 1315922016  5.80      1 2011-09-13
## 2 1315922024  5.83      3 2011-09-13
library(plyr)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:plyr':
## 
##     arrange, count, desc, failwith, id, mutate, rename, summarise,
##     summarize
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(zoo)
## 
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
## 
##     as.Date, as.Date.numeric
library(xts)
## 
## Attaching package: 'xts'
## The following objects are masked from 'package:dplyr':
## 
##     first, last
library(dygraphs)
data <- select(raw, -unixtime)   # drop the raw epoch column
rm(raw)                          # free memory
data <- mutate(data, value = price * amount)  # trade value in USD
by_date <- group_by(data,date)
# Use named arguments inside summarise(); assigning with <- here would create
# unnamed columns that then need error-prone renaming afterwards
daily <- summarise(by_date,
                   count    = n(),
                   m_price  = mean(price,  na.rm = TRUE),
                   m_amount = mean(amount, na.rm = TRUE),
                   m_value  = mean(value,  na.rm = TRUE))
head(daily,2)
## # A tibble: 2 x 5
##   date       count m_value m_price m_amount
##   <date>     <int>   <dbl>   <dbl>    <dbl>
## 1 2011-09-13    12    5.87    4.86     28.8
## 2 2011-09-14    14    5.58    4.37     24.4
str(daily)
## Classes 'tbl_df', 'tbl' and 'data.frame':    2453 obs. of  5 variables:
##  $ date    : Date, format: "2011-09-13" "2011-09-14" ...
##  $ count   : int  12 14 6 4 1 8 1 17 5 5 ...
##  $ m_value : num  5.87 5.58 5.12 4.83 4.87 ...
##  $ m_price : num  4.86 4.37 13.36 9.98 0.3 ...
##  $ m_amount: num  28.84 24.42 68.04 48.44 1.46 ...

Make the m_value variable into a time series object

daily_ts <- xts(daily$m_value, order.by = daily$date)
plot(daily_ts)

Log transformation

newts <- log(daily_ts)
plot(newts)

## As can be seen from the newts plot, the series is neither seasonal nor cyclical; it is a trend-type time series.
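
One informal way to check the trend claim is to overlay a rolling mean on the log series; a sketch using zoo::rollmean (the 30-day window is an arbitrary choice):

# Overlay a 30-day rolling mean as an informal trend check
nz <- as.zoo(newts)                       # zoo plotting makes overlays easy
plot(nz, main = "log(m_value) with 30-day rolling mean")
lines(rollmean(nz, k = 30), col = "red")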

Prediction

myts <- ts(daily$m_value, start=c(2012, 1), end=c(2017, 12), frequency=12)
plot(myts)

## The myts series runs only from January 2012 to December 2017, whereas daily_ts runs from September 2011 to June 2018. This is intentional: forecasts produced from myts can be checked for accuracy against the actual values in the daily_ts series.
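
Note that ts() does not aggregate: it simply relabels the first 72 values of the daily series as monthly observations. If a true monthly series is preferred, the daily means could be averaged by month first; a sketch using zoo::as.yearmon (monthly and myts2 are hypothetical names, not part of the original analysis):

# Average the daily means within each calendar month
monthly <- daily %>%
  mutate(month = as.yearmon(date)) %>%
  group_by(month) %>%
  summarise(m_value = mean(m_value, na.rm = TRUE))
myts2 <- ts(monthly$m_value, start = c(2011, 9), frequency = 12)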

Simple exponential - models level

fit <- HoltWinters(myts, beta=FALSE, gamma=FALSE)

Double exponential - models level and trend

fit <- HoltWinters(myts, gamma=FALSE)

Triple exponential - models level, trend, and seasonal components

fit <- HoltWinters(myts)
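
Each of these three calls overwrites fit, so only the triple-exponential model is carried into the forecast below. Keeping the fits under separate names allows an in-sample comparison (a sketch; smaller SSE means a closer fit):

fit_ses <- HoltWinters(myts, beta = FALSE, gamma = FALSE)  # level only
fit_des <- HoltWinters(myts, gamma = FALSE)                # level + trend
fit_tes <- HoltWinters(myts)                               # level + trend + seasonality
c(SES = fit_ses$SSE, DES = fit_des$SSE, TES = fit_tes$SSE)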

Predictive accuracy and prediction of the next three values

install.packages("forecast")

library(forecast)
forecast(fit, 3)
##          Point Forecast    Lo 80    Hi 80    Lo 95    Hi 95
## Jan 2018       2.672097 1.997656 3.346538 1.640629 3.703565
## Feb 2018       2.628337 1.844973 3.411702 1.430285 3.826390
## Mar 2018       2.571115 1.692224 3.450006 1.226967 3.915263
plot(forecast(fit, 3))

## Comparing the forecast for the first three months of 2018 against the actual daily_ts series over the same period: as can be seen in the time series plot, the first three months of 2018 show an upward trend.
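
To make that visual comparison quantitative, forecast::accuracy() can score the point forecasts against the realized values. A sketch, assuming the actual January-March 2018 monthly means are available (here taken from the hypothetical myts2 built in the monthly-aggregation sketch above):

fc <- forecast(fit, h = 3)
# actual_2018: realized monthly means for Jan-Mar 2018 (hypothetical name)
actual_2018 <- window(myts2, start = c(2018, 1), end = c(2018, 3))
accuracy(fc, actual_2018)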