library(RJSONIO)
url <- "https://www.bitstamp.net/api/transactions/"
bs_data <- fromJSON(url) # returns a list
bs_df <- do.call(rbind,lapply(bs_data,data.frame,stringsAsFactors=FALSE))
head(bs_df)
## date tid price type amount
## 1 1529634069 68796428 6657.95 0 4.02163929
## 2 1529634052 68796423 6657.95 0 0.03417848
## 3 1529634042 68796418 6657.95 0 0.00108503
## 4 1529634041 68796416 6655.66 0 0.01000000
## 5 1529634040 68796415 6655.55 0 1.00000000
## 6 1529634040 68796414 6655.00 0 2.52216676
summary(bs_df)
## date tid price type
## Length:100 Min. :68796028 Length:100 Min. :0.00
## Class :character 1st Qu.:68796134 Class :character 1st Qu.:0.00
## Mode :character Median :68796286 Mode :character Median :0.00
## Mean :68796256 Mean :0.43
## 3rd Qu.:68796373 3rd Qu.:1.00
## Max. :68796428 Max. :1.00
## amount
## Length:100
## Class :character
## Mode :character
##
##
##
library(plyr)
bs_df2 <- ldply(bs_data,data.frame)
nullToNA <- function(x) {
x[sapply(x, is.null)] <- NA
return(x)
}
head(bs_df2 )
## date tid price type amount
## 1 1529634069 68796428 6657.95 0 4.02163929
## 2 1529634052 68796423 6657.95 0 0.03417848
## 3 1529634042 68796418 6657.95 0 0.00108503
## 4 1529634041 68796416 6655.66 0 0.01000000
## 5 1529634040 68796415 6655.55 0 1.00000000
## 6 1529634040 68796414 6655.00 0 2.52216676
bs_df2$type<- factor(bs_df2$type)
is.factor(bs_df2$type)
## [1] TRUE
bs_df2$price = as.numeric(bs_df2$price)
bs_df2$amount = as.numeric(bs_df2$amount)
summary(bs_df2)
## date tid price type amount
## 1529634040: 5 Min. :68796028 Min. : 1.00 0:57 Min. : 1.00
## 1529633667: 4 1st Qu.:68796134 1st Qu.: 7.00 1:43 1st Qu.:20.75
## 1529633967: 3 Median :68796286 Median :23.50 Median :45.50
## 1529633913: 3 Mean :68796256 Mean :25.45 Mean :44.12
## 1529633836: 3 3rd Qu.:68796373 3rd Qu.:41.25 3rd Qu.:66.25
## 1529633749: 3 Max. :68796428 Max. :59.00 Max. :91.00
## (Other) :79
plot(amount~price,data=bs_df2, main="Relationship between price and amount")
#Standard deviation
sapply(bs_df2,sd)
## Warning in var(if (is.vector(x) || is.factor(x)) x else as.double(x), na.rm = na.rm): Calling var(x) on a factor x is deprecated and will become an error.
## Use something like 'all(duplicated(x)[-1L])' to test for a constant vector.
## Warning in var(if (is.vector(x) || is.factor(x)) x else as.double(x), na.rm = na.rm): Calling var(x) on a factor x is deprecated and will become an error.
## Use something like 'all(duplicated(x)[-1L])' to test for a constant vector.
## date tid price type amount
## 20.0276854 127.6006535 18.3288562 0.4975699 26.8863226
fit= glm(type~amount+price , data=bs_df2, family=binomial(link="logit"))
summary(fit)
##
## Call:
## glm(formula = type ~ amount + price, family = binomial(link = "logit"),
## data = bs_df2)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.170 -1.063 -1.031 1.289 1.654
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -0.32415 0.39817 -0.814 0.416
## amount 0.01185 0.01910 0.621 0.535
## price -0.01896 0.02795 -0.678 0.498
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 136.66 on 99 degrees of freedom
## Residual deviance: 136.18 on 97 degrees of freedom
## AIC: 142.18
##
## Number of Fisher Scoring iterations: 4
getwd()
## [1] "C:/Users/SurbhiK/Documents/ANLY699"
survey<- read.csv("Survey1.csv", header=TRUE)
head(survey)
## AgeGroup Gender Income Race TradedCrypto NameCurrency Profitability
## 1 0 0 3 0 1 Ethereum 0
## 2 0 0 3 0 1 Ripple 0
## 3 1 0 4 2 1 Ethereum 0
## 4 1 0 3 0 1 Ethereum 0
## 5 1 1 3 0 1 Litecoin 0
## 6 0 1 3 0 1 Ethereum 0
count<- c(4, 3,2,1)
namecurrency<- c("Ethereum", "Litecoin", "Ripple","Bitcoin")
barplot(count, main = "Bar Chart for Types of CryptoCurrencies", names.arg=namecurrency)
survey$AgeGroup = as.factor(survey$AgeGroup )
survey$Gender = as.factor(survey$Gender )
survey$Income = as.factor(survey$Income )
survey$Race = as.factor(survey$Race)
summary(survey)
## AgeGroup Gender Income Race TradedCrypto NameCurrency Profitability
## 0:7 0:6 3:6 0:9 Min. :1 Bitcoin :1 Min. :0.0
## 1:3 1:4 4:4 2:1 1st Qu.:1 Ethereum:4 1st Qu.:0.0
## Median :1 Litecoin:3 Median :0.0
## Mean :1 Ripple :2 Mean :0.2
## 3rd Qu.:1 3rd Qu.:0.0
## Max. :1 Max. :1.0
sapply(survey,sd)
## Warning in var(if (is.vector(x) || is.factor(x)) x else as.double(x), na.rm = na.rm): Calling var(x) on a factor x is deprecated and will become an error.
## Use something like 'all(duplicated(x)[-1L])' to test for a constant vector.
## Warning in var(if (is.vector(x) || is.factor(x)) x else as.double(x), na.rm = na.rm): Calling var(x) on a factor x is deprecated and will become an error.
## Use something like 'all(duplicated(x)[-1L])' to test for a constant vector.
## Warning in var(if (is.vector(x) || is.factor(x)) x else as.double(x), na.rm = na.rm): Calling var(x) on a factor x is deprecated and will become an error.
## Use something like 'all(duplicated(x)[-1L])' to test for a constant vector.
## Warning in var(if (is.vector(x) || is.factor(x)) x else as.double(x), na.rm = na.rm): Calling var(x) on a factor x is deprecated and will become an error.
## Use something like 'all(duplicated(x)[-1L])' to test for a constant vector.
## Warning in var(if (is.vector(x) || is.factor(x)) x else as.double(x), na.rm = na.rm): Calling var(x) on a factor x is deprecated and will become an error.
## Use something like 'all(duplicated(x)[-1L])' to test for a constant vector.
## AgeGroup Gender Income Race TradedCrypto
## 0.4830459 0.5163978 0.5163978 0.3162278 0.0000000
## NameCurrency Profitability
## 0.9660918 0.4216370
fit_survey= glm(Profitability~AgeGroup+Gender+Income+Race , data=survey, family=binomial(link="logit"))
summary(fit_survey)
##
## Call:
## glm(formula = Profitability ~ AgeGroup + Gender + Income + Race,
## family = binomial(link = "logit"), data = survey)
##
## Deviance Residuals:
## 1 2 3 4 5 6 7
## -0.73395 -0.73395 -0.00008 -0.00007 -0.00009 -0.83043 -0.83043
## 8 9 10
## 1.69907 1.44080 -0.93513
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.1741 1.2645 -0.928 0.353
## AgeGroup1 -18.5412 7587.8338 -0.002 0.998
## Gender1 0.2867 1.8511 0.155 0.877
## Income4 0.2867 1.8511 0.155 0.877
## Race2 -0.1374 13161.4597 0.000 1.000
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 10.0080 on 9 degrees of freedom
## Residual deviance: 8.2938 on 5 degrees of freedom
## AIC: 18.294
##
## Number of Fisher Scoring iterations: 18
bitcoin_file <- "bitstampUSD.csv.gz"
URL <- "http://api.bitcoincharts.com/v1/csv"
source_file <- file.path(URL,bitcoin_file)
getwd()
## [1] "C:/Users/SurbhiK/Documents/ANLY699"
dataDir <-"C:/Users/SurbhiK/Documents"
dest_file <- file.path(dataDir,bitcoin_file)
download.file(source_file,destfile = dest_file)
raw <- read.csv(gzfile(dest_file),header=FALSE)
head(raw,2)
## V1 V2 V3
## 1 1315922016 5.80 1
## 2 1315922024 5.83 3
names(raw) <- c("unixtime","price","amount")
raw$date <- as.Date(as.POSIXct(raw$unixtime, origin="1970-01-01"))
head(raw,2)
## unixtime price amount date
## 1 1315922016 5.80 1 2011-09-13
## 2 1315922024 5.83 3 2011-09-13
library(plyr)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:plyr':
##
## arrange, count, desc, failwith, id, mutate, rename, summarise,
## summarize
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(zoo)
##
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
##
## as.Date, as.Date.numeric
library(xts)
##
## Attaching package: 'xts'
## The following objects are masked from 'package:dplyr':
##
## first, last
library(dygraphs)
data <- select(raw,-unixtime)
rm(raw)
data <- mutate(data,value = price * amount)
by_date <- group_by(data,date)
daily <- summarise(by_date,count = n(),
m_price <- mean(price, na.rm = TRUE),
m_amount <- mean(amount, na.rm = TRUE),
m_value <- mean(value, na.rm = TRUE))
names(daily) <- c("date","count","m_value","m_price","m_amount")
head(daily,2)
## # A tibble: 2 x 5
## date count m_value m_price m_amount
## <date> <int> <dbl> <dbl> <dbl>
## 1 2011-09-13 12 5.87 4.86 28.8
## 2 2011-09-14 14 5.58 4.37 24.4
str(daily)
## Classes 'tbl_df', 'tbl' and 'data.frame': 2453 obs. of 5 variables:
## $ date : Date, format: "2011-09-13" "2011-09-14" ...
## $ count : int 12 14 6 4 1 8 1 17 5 5 ...
## $ m_value : num 5.87 5.58 5.12 4.83 4.87 ...
## $ m_price : num 4.86 4.37 13.36 9.98 0.3 ...
## $ m_amount: num 28.84 24.42 68.04 48.44 1.46 ...
daily_ts <- xts(daily$m_value,order.by=daily$date)
plot(daily_ts)
#log transformation
newts <- log(daily_ts)
plot(newts)
## as can be seen from the newts plot the graph is not seasonal or cyclical. The data is a trend type time series.
myts <- ts(daily$m_value, start=c(2012, 1), end=c(2017, 12), frequency=12)
plot(myts)
## The time series in myts only runs from January 2012 to December2017. Whereas daily_ts data set runs from September 2011 to June 2018. This is done intentionally so that when forecasting using myts dataset we can see whether the forecast is accurate or not by comparing it to the daily_ts time series plot
fit <- HoltWinters(myts, beta=FALSE, gamma=FALSE)
fit <- HoltWinters(myts, gamma=FALSE)
fit <- HoltWinters(myts)
install.packages(‘forecast’)
library(forecast)
forecast(fit, 3)
## Point Forecast Lo 80 Hi 80 Lo 95 Hi 95
## Jan 2018 2.672097 1.997656 3.346538 1.640629 3.703565
## Feb 2018 2.628337 1.844973 3.411702 1.430285 3.826390
## Mar 2018 2.571115 1.692224 3.450006 1.226967 3.915263
plot(forecast(fit, 3))
## Comparing fit dataset forecast for first three months of 2018 to myts forecast for 2018 first 3 months time period. As can be seen in the time series plot the first 3 months of 2018 has an upward trend.