library(bitops)
library(RCurl)
library(dplyr)
## Warning: package 'dplyr' was built under R version 3.3.3
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(corrplot)
## Warning: package 'corrplot' was built under R version 3.3.3
## corrplot 0.84 loaded
library(caTools)
## Warning: package 'caTools' was built under R version 3.3.3
# Download the raw bitcoin CSV as text and parse it into a data frame.
# NOTE(review): the URL below embeds what looks like a GitHub access token in
# its query string. Secrets should not live in source control -- consider
# reading it from an environment variable (Sys.getenv) and revoking this one.
url <- getURL("https://raw.githubusercontent.com/VioletaStoyanova/data602-finalproject/master/data_bcoin.csv?token=AXwBJU16Hn7phgNt_jhrTjSMnn7o5_y7ks5a9y8HwA%3D%3D")
bitcoin <- read.csv(
  text = url, header = TRUE, sep = ",", stringsAsFactors = FALSE
)
# Fail early if the response was not the expected CSV (for example an error
# page returned for an expired token), instead of silently analysing garbage.
stopifnot(nrow(bitcoin) > 0, "Price" %in% names(bitcoin))
# Sequential row identifier; seq_len() is also safe for a zero-row frame.
bitcoin$ID <- seq_len(nrow(bitcoin))
head(bitcoin)
## date Price volume mktcap avg_blk_size no.trnsactions
## 1 9/1/2010 0.06490 429.8832 251747.1 0.000832290 408
## 2 9/3/2010 0.06340 53.5200 246914.5 0.000530263 314
## 3 9/5/2010 0.06290 507.5544 246039.6 0.000532586 303
## 4 9/7/2010 0.06185 207.4314 242922.1 0.000515358 271
## 5 9/9/2010 0.06240 104.0100 246027.6 0.000665894 343
## 6 9/11/2010 0.06200 464.9484 245482.8 0.000467784 296
## mempoolsize mempoolcount cost.per.trnsaction difficulty trnsct_vol
## 1 NA NA 1.399804 623.3870 688
## 2 NA NA 1.615287 623.3870 1751
## 3 NA NA 1.681485 623.3870 862
## 4 NA NA 1.723127 623.3870 1347
## 5 NA NA 1.373528 712.8849 1015
## 6 NA NA 1.696622 712.8849 556
## hash.rate miner.rev orph.block trnsction.per.blk n_transction
## 1 0.005454017 571.1200 0 1 408
## 2 0.004958197 507.2000 0 1 314
## 3 0.005020175 509.4900 0 2 303
## 4 0.004679299 466.9675 0 1 271
## 5 0.005351093 471.1200 0 2 343
## 6 0.005740907 502.2000 0 2 296
## n_uni_addres trnsct_fee gold ID
## 1 459 0.008437006 1250 1
## 2 363 0.000000000 1252 2
## 3 351 0.000629000 1252 3
## 4 284 0.000000000 1247.25 4
## 5 383 0.006864000 1253.5 5
## 6 328 0.000000000 1248.75 6
# Dimensions and container type of the loaded data.
dim(bitcoin)
## [1] 1396 20
class(bitcoin)
## [1] "data.frame"
# Seed the RNG so the split below is reproducible, then draw a logical mask
# that marks 75% of the rows for training.
set.seed(101)
sample <- sample.split(bitcoin$Price, SplitRatio = 0.75)
# Rows flagged TRUE form the training set; the remainder is held out.
train <- bitcoin[sample, ]
test <- bitcoin[!sample, ]
# Numeric predictors of the training split, with the response (Price) removed.
# select() is namespace-qualified because MASS, attached further down, masks
# dplyr::select; the unqualified call breaks when the script is re-run in the
# same session.
nums <- train %>%
  select_if(is.numeric) %>%
  dplyr::select(-Price)
# Look at our variables and how correlated they are with price.
# Some columns (mempoolsize and mempoolcount in the head() output) contain NA,
# which makes cor() return NA for them by default. Pairwise-complete
# observations give every variable a usable coefficient.
numscorr <- cor(nums, train$Price, use = "pairwise.complete.obs")
corrplot(numscorr, method = "number")
corrplot(numscorr, method = "color")
# Just for fun, let's look at all of our non-price variables
# against each other.
# Note: with pairwise deletion each cell may be based on a different set of
# rows, so this matrix is for plotting only.
numscorr2 <- cor(nums, use = "pairwise.complete.obs")
corrplot(numscorr2, method = "color")
# Looking at the correlation plot, miner.rev has the highest correlation with
# Price, followed by trnsct_vol. Let's take a closer look at miner.rev.
library(psych)
## Warning: package 'psych' was built under R version 3.3.3
library(MASS)
##
## Attaching package: 'MASS'
## The following object is masked from 'package:dplyr':
##
## select
# Scatter plot of Price against miner revenue.
plot(train$Price ~ train$miner.rev)
# miner.rev is a continuous variable, so using it directly as the grouping
# factor draws one degenerate box per distinct value. Bin it into quartiles so
# each box summarises a meaningful group of observations.
boxplot(
  train$Price ~ cut(
    train$miner.rev,
    breaks = unique(quantile(
      train$miner.rev,
      probs = seq(0, 1, 0.25), na.rm = TRUE
    )),
    include.lowest = TRUE
  ),
  xlab = "Miner revenue (quartile bins)", ylab = "Price"
)
#by(train$Price, train$miner.rev, mean)
#by(train$Price, train$miner.rev, length)
# Distribution of the response and the raw miner revenue series in row order.
hist(train$Price, xlab = "Price", main = "Histogram of Price")
plot(train$miner.rev)
# Summary statistics (psych::describe) for the predictor and the response.
describe(train$miner.rev)
## vars n mean sd median trimmed mad min max
## X1 1 1047 2639666 5884538 986961.6 1222465 1372061 387.51 53191582
## range skew kurtosis se
## X1 53191195 4.59 25.1 181860.8
describe(train$Price)
## vars n mean sd median trimmed mad min max range
## X1 1 1047 1073.78 2540.8 264.07 393.25 382.3 0.06 19289.78 19289.72
## skew kurtosis se
## X1 3.79 15.66 78.52
# Box-Cox profile likelihood for a power transform of Price in the simple
# linear model on miner.rev. boxcox() draws the profile itself; plot(bc) then
# re-plots the returned lambda / log-likelihood pairs.
bc <- boxcox(train$Price ~ train$miner.rev)
plot(bc)
# Correlation matrix of all numeric predictors. Pairwise-complete observations
# keep columns that contain NA from turning their whole row/column into NA.
numscorr <- cor(nums, use = "pairwise.complete.obs")
# sig.level / insig only take effect when corrplot() is given a matrix of
# p-values; without p.mat they were silently ignored. cor.mtest() (corrplot)
# runs cor.test() on every pair of columns and returns those p-values.
allcorr <- corrplot(
  numscorr,
  type = "full", method = "color",
  p.mat = cor.mtest(nums)$p, sig.level = 0.01, insig = "blank"
)
# Correlation matrix of three selected predictors.
numscorr3 <- cor(nums[, c("hash.rate", "miner.rev", "trnsction.per.blk")])
# Invert it, then multiply in both orders; each product should print as the
# identity matrix up to floating-point rounding.
invnumscorr3 <- solve(numscorr3)
numscorr3 %*% invnumscorr3
## hash.rate miner.rev trnsction.per.blk
## hash.rate 1.000000e+00 1.387779e-16 0.000000e+00
## miner.rev -2.498002e-16 1.000000e+00 -1.110223e-16
## trnsction.per.blk -1.665335e-16 5.551115e-17 1.000000e+00
invnumscorr3 %*% numscorr3
## hash.rate miner.rev trnsction.per.blk
## hash.rate 1.000000e+00 1.942890e-16 5.551115e-17
## miner.rev -3.053113e-16 1.000000e+00 -1.665335e-16
## trnsction.per.blk 0.000000e+00 -1.110223e-16 1.000000e+00
# Maximum-likelihood log-normal fit to miner revenue over the whole training
# split, followed by its histogram.
with(train, fitdistr(miner.rev, densfun = "log-normal"))
## meanlog sdlog
## 13.07856997 2.34360987
## ( 0.07242891) ( 0.05121498)
with(
  train,
  hist(miner.rev, xlab = "Miner Revenue", main = "Full Miner Revenue Population")
)
# Repeat the fit and histogram on a random draw of 1000 values (without
# replacement) to compare against the full-population estimates.
OQ1000 <- sample(x = train$miner.rev, size = 1000)
fitdistr(OQ1000, densfun = "log-normal")
## meanlog sdlog
## 13.09950928 2.33530427
## ( 0.07384881) ( 0.05221899)
hist(
  OQ1000,
  xlab = "Miner Revenue",
  main = "Subset of the Miner Revenue Population"
)
library(randomForest)
## Warning: package 'randomForest' was built under R version 3.3.3
## randomForest 4.6-12
## Type rfNews() to see new features/changes/bug fixes.
##
## Attaching package: 'randomForest'
## The following object is masked from 'package:psych':
##
## outlier
## The following object is masked from 'package:dplyr':
##
## combine
library(ModelMetrics)
## Warning: package 'ModelMetrics' was built under R version 3.3.3
# ---- Random forest model for Price ----
#
# The forest is fitted on the training split only. Previously `train` was
# overwritten here with a fresh read of the full CSV, so the held-out `test`
# rows were part of the fitting data and the only error reported was an
# in-sample figure.
# NOTE: the console output recorded from that earlier run (summary, head and
# RMSE values) has been removed because those numbers no longer apply; re-run
# the script to regenerate it.

# Keep numeric columns only and drop ID, which is a row counter rather than a
# predictor.
train1 <- train %>%
  select_if(is.numeric) %>%
  dplyr::select(-ID)
# Replace missing values with the sentinel -1 so randomForest() accepts the
# data (its default na.action fails on NA).
train1[is.na(train1)] <- -1
# `verbose` is not a randomForest() argument and was silently ignored, so it
# has been dropped; use do.trace = TRUE if progress output is wanted.
fit <- randomForest(Price ~ ., data = train1, ntree = 1100)
summary(fit)

# In-sample predictions on the rows the forest was fitted to.
prd <- predict(fit, train1)

# Held-out predictions, with the same NA sentinel applied to the test split.
test1 <- test
test1[is.na(test1)] <- -1
testprd <- predict(fit, test1)

# Two-column table (ID, Price) of the test predictions. The ID is derived from
# the number of predictions rather than hard-coded to 349.
testprd.df <- data.frame(testprd)
testprd.df$ID <- seq_len(nrow(testprd.df))
names(testprd.df)[names(testprd.df) == "testprd"] <- "Price"
testprd.df <- testprd.df[c(2, 1)]
head(testprd.df)

# Training (in-sample) RMSE, kept for comparison with the original analysis.
rmse(train1$Price, prd)
# Held-out RMSE: the honest estimate of predictive error.
rmse(test1$Price, testprd)
# Variable importance of the fitted forest.
varImpPlot(fit)