Team Members: Eric Pitruzzella, Kelly Shaffer, Violeta Stoyanova

library(bitops)
library(RCurl)
library(dplyr)
## Warning: package 'dplyr' was built under R version 3.3.3
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(corrplot)
## Warning: package 'corrplot' was built under R version 3.3.3
## corrplot 0.84 loaded
library(caTools)
## Warning: package 'caTools' was built under R version 3.3.3
# Download the raw CSV text and parse it into a data frame.
# NOTE(review): the query string embeds a GitHub access token. Tokens should
# not be committed in source; rotate it and read it from an environment
# variable (or make the data file public) instead.
# NOTE: despite its name, `url` holds the downloaded CSV text, not the address.
url <- getURL("https://raw.githubusercontent.com/VioletaStoyanova/data602-finalproject/master/data_bcoin.csv?token=AXwBJU16Hn7phgNt_jhrTjSMnn7o5_y7ks5a9y8HwA%3D%3D")
bitcoin <- read.csv(text = url, header = TRUE, sep = ",", stringsAsFactors = FALSE)

# Sequential row identifier; seq_len() stays well-defined for a 0-row frame.
bitcoin$ID <- seq_len(nrow(bitcoin))

head(bitcoin)
##        date   Price   volume   mktcap avg_blk_size no.trnsactions
## 1  9/1/2010 0.06490 429.8832 251747.1  0.000832290            408
## 2  9/3/2010 0.06340  53.5200 246914.5  0.000530263            314
## 3  9/5/2010 0.06290 507.5544 246039.6  0.000532586            303
## 4  9/7/2010 0.06185 207.4314 242922.1  0.000515358            271
## 5  9/9/2010 0.06240 104.0100 246027.6  0.000665894            343
## 6 9/11/2010 0.06200 464.9484 245482.8  0.000467784            296
##   mempoolsize mempoolcount cost.per.trnsaction difficulty trnsct_vol
## 1          NA           NA            1.399804   623.3870        688
## 2          NA           NA            1.615287   623.3870       1751
## 3          NA           NA            1.681485   623.3870        862
## 4          NA           NA            1.723127   623.3870       1347
## 5          NA           NA            1.373528   712.8849       1015
## 6          NA           NA            1.696622   712.8849        556
##     hash.rate miner.rev orph.block trnsction.per.blk n_transction
## 1 0.005454017  571.1200          0                 1          408
## 2 0.004958197  507.2000          0                 1          314
## 3 0.005020175  509.4900          0                 2          303
## 4 0.004679299  466.9675          0                 1          271
## 5 0.005351093  471.1200          0                 2          343
## 6 0.005740907  502.2000          0                 2          296
##   n_uni_addres  trnsct_fee    gold ID
## 1          459 0.008437006    1250  1
## 2          363 0.000000000    1252  2
## 3          351 0.000629000    1252  3
## 4          284 0.000000000 1247.25  4
## 5          383 0.006864000  1253.5  5
## 6          328 0.000000000 1248.75  6
dim(bitcoin)
## [1] 1396   20
class(bitcoin)
## [1] "data.frame"
# Reproducible 75/25 train/test split. The logical mask returned by
# sample.split() is TRUE for rows assigned to the training set. It is kept
# under its own name so it does not shadow base::sample(), which is called
# later in this script.
set.seed(101)
in_train <- sample.split(bitcoin$Price, SplitRatio = 0.75)
train <- bitcoin[in_train, ]
test <- bitcoin[!in_train, ]

# Numeric columns of the training split with the response (Price) removed.
# select() is namespace-qualified because MASS, attached further down, masks
# dplyr::select(); the qualified form keeps this line working on a re-run.
nums <- train %>% select_if(is.numeric) %>% dplyr::select(-Price)

# Correlation of each numeric variable with Price.
# Some columns contain NA (e.g. mempoolsize, mempoolcount), and cor()'s default
# use = "everything" would return NA for them. Pairwise-complete observations
# give every variable a usable coefficient.
numscorr <- cor(nums, train$Price, use = "pairwise.complete.obs")
corrplot(numscorr, method = "number")

corrplot(numscorr, method = "color")

# Just for fun, correlations of all non-price variables against each other.
numscorr2 <- cor(nums, use = "pairwise.complete.obs")
corrplot(numscorr2, method = "color")

Looking at our correlation plot, we can see that miner.rev has the highest correlation with price. In second place, we have trnsct_vol. Let’s take a look at miner.rev.

library(psych)
## Warning: package 'psych' was built under R version 3.3.3
library(MASS)
## 
## Attaching package: 'MASS'
## The following object is masked from 'package:dplyr':
## 
##     select
# Scatter of Price against miner revenue on the training split.
plot(train$Price ~ train$miner.rev)

# miner.rev is continuous, so using it directly as a boxplot grouping variable
# yields one degenerate box per distinct value. Bin it into quartiles so each
# box summarises a meaningful share of the observations.
rev_quartile <- cut(
  train$miner.rev,
  breaks = quantile(train$miner.rev, probs = seq(0, 1, 0.25), na.rm = TRUE),
  include.lowest = TRUE,
  labels = c("Q1", "Q2", "Q3", "Q4")
)
boxplot(
  train$Price ~ rev_quartile,
  xlab = "Miner revenue quartile",
  ylab = "Price"
)

#by(train$Price, rev_quartile, mean)
#by(train$Price, rev_quartile, length)

# Distribution of the response.
hist(train$Price, xlab = "Price", main = "Histogram of Price")

# miner.rev in row order.
plot(train$miner.rev)

# Summary statistics for miner.rev.
describe(train$miner.rev)
##    vars    n    mean      sd   median trimmed     mad    min      max
## X1    1 1047 2639666 5884538 986961.6 1222465 1372061 387.51 53191582
##       range skew kurtosis       se
## X1 53191195 4.59     25.1 181860.8
describe(train$Price)
##    vars    n    mean     sd median trimmed   mad  min      max    range
## X1    1 1047 1073.78 2540.8 264.07  393.25 382.3 0.06 19289.78 19289.72
##    skew kurtosis    se
## X1 3.79    15.66 78.52
# Box-Cox profile for the regression of Price on miner.rev; the returned
# object is stored in `bc` and plotted again below.
bc <- boxcox(train$Price ~ train$miner.rev)

plot(bc)

# Full correlation matrix of the numeric predictors, drawn as a colour map.
# NOTE(review): no p.mat is supplied, so sig.level / insig presumably have no
# effect on this plot - confirm against the corrplot documentation.
# NOTE(review): with cor()'s default NA handling, columns that contain NA
# produce NA entries in this matrix.
numscorr <- cor(nums)
allcorr <- corrplot(
  numscorr,
  type = "full",
  method = "color",
  sig.level = 0.01,
  insig = "blank"
)

# 3 x 3 correlation matrix for three selected predictors.
numscorr3 <- cor(nums[, c("hash.rate", "miner.rev", "trnsction.per.blk")])

# Its inverse.
invnumscorr3 <- solve(numscorr3)

# Matrix times its inverse; the result is printed for inspection.
numscorr3 %*% invnumscorr3
##                       hash.rate    miner.rev trnsction.per.blk
## hash.rate          1.000000e+00 1.387779e-16      0.000000e+00
## miner.rev         -2.498002e-16 1.000000e+00     -1.110223e-16
## trnsction.per.blk -1.665335e-16 5.551115e-17      1.000000e+00
# Same product with the operands in the opposite order.
invnumscorr3 %*% numscorr3
##                       hash.rate     miner.rev trnsction.per.blk
## hash.rate          1.000000e+00  1.942890e-16      5.551115e-17
## miner.rev         -3.053113e-16  1.000000e+00     -1.665335e-16
## trnsction.per.blk  0.000000e+00 -1.110223e-16      1.000000e+00
# Log-normal fit to miner revenue over the whole training split.
fitdistr(x = train$miner.rev, densfun = "log-normal")
##      meanlog        sdlog   
##   13.07856997    2.34360987 
##  ( 0.07242891) ( 0.05121498)
hist(
  x = train$miner.rev,
  main = "Full Miner Revenue Population",
  xlab = "Miner Revenue"
)

# Draw 1000 values without replacement and repeat the fit and histogram on
# that subset.
OQ1000 <- sample(x = train$miner.rev, size = 1000)
fitdistr(x = OQ1000, densfun = "log-normal")
##      meanlog        sdlog   
##   13.09950928    2.33530427 
##  ( 0.07384881) ( 0.05221899)
hist(
  x = OQ1000,
  main = "Subset of the Miner Revenue Population",
  xlab = "Miner Revenue"
)

Modeling

library(randomForest)
## Warning: package 'randomForest' was built under R version 3.3.3
## randomForest 4.6-12
## Type rfNews() to see new features/changes/bug fixes.
## 
## Attaching package: 'randomForest'
## The following object is masked from 'package:psych':
## 
##     outlier
## The following object is masked from 'package:dplyr':
## 
##     combine
library(ModelMetrics)
## Warning: package 'ModelMetrics' was built under R version 3.3.3
# Fit the random forest on the training split only.
# The earlier version re-read the complete file into `train`, so the model was
# fit on every row, including the rows held out in `test` (data leakage).
# `train` from the 75/25 split already carries the ID column, so nothing needs
# to be re-read or re-numbered here.
# NOTE: the captured console output further down (1396 rows, RMSE value) was
# produced by the full-data fit; re-knit the document to refresh it.

# Numeric columns only; ID is a row counter, not a predictor.
train1 <- train %>% select_if(is.numeric) %>% dplyr::select(-ID)

# randomForest() does not accept NA; missing values are replaced by the
# sentinel -1. NOTE(review): confirm -1 cannot occur as a genuine value in the
# affected columns.
train1[is.na(train1)] <- -1

# `verbose` is not a randomForest() argument (progress printing is controlled
# by `do.trace`), so it has been dropped.
fit <- randomForest(Price ~ ., data = train1, ntree = 1100)
summary(fit)
##                 Length Class  Mode     
## call               5   -none- call     
## type               1   -none- character
## predicted       1396   -none- numeric  
## mse             1100   -none- numeric  
## rsq             1100   -none- numeric  
## oob.times       1396   -none- numeric  
## importance        16   -none- numeric  
## importanceSD       0   -none- NULL     
## localImportance    0   -none- NULL     
## proximity          0   -none- NULL     
## ntree              1   -none- numeric  
## mtry               1   -none- numeric  
## forest            11   -none- list     
## coefs              0   -none- NULL     
## y               1396   -none- numeric  
## test               0   -none- NULL     
## inbag              0   -none- NULL     
## terms              3   terms  call
# Predictions for the rows in train1.
prd <- predict(fit, train1)

# Held-out rows, with NA replaced by the same -1 sentinel used for train1.
test1 <- test
test1[is.na(test1)] <- -1

testprd <- predict(fit, test1)

# Two-column table (ID, Price) of the test predictions.
# ID is a running counter derived from the number of predictions instead of a
# hard-coded 1:349, so it keeps working if the data or split ratio changes.
# Row names are carried over from the prediction vector, as before.
testprd.df <- data.frame(
  ID = seq_along(testprd),
  Price = testprd,
  row.names = names(testprd)
)

head(testprd.df)
##    ID      Price
## 11  1 0.06362292
## 14  2 0.06736925
## 17  3 0.06600988
## 22  4 0.10000977
## 25  5 0.10927971
## 26  6 0.11822413
# RMSE of the predictions made on train1 itself (in-sample error).
rmse(train1$Price, prd)
## [1] 79.89275
# RMSE of the predictions for the held-out test rows. This is the figure to
# report as generalisation error, provided the model was not fit on these rows;
# in-sample error from predict() on the fitting data is optimistic.
rmse(test1$Price, testprd)

# Variable importance of the fitted forest.
varImpPlot(fit)