Random Forest Bitcoin

# Bibliotecas
#----------------------------
library(bitops)
library(RCurl)
library(dplyr)

## 
## Attaching package: 'dplyr'

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

library(corrplot)

## corrplot 0.92 loaded

library(caTools)

# Data frame
#--------------------------

# Download the CSV once. Note that `url` ends up holding the downloaded CSV
# text itself, not the address.
# NOTE(review): the address carries a `token` query parameter -- presumably a
# temporary access token; confirm it is still valid before re-running.
url <- getURL("https://raw.githubusercontent.com/VioletaStoyanova/data602-finalproject/master/data_bcoin.csv?token=AXwBJU16Hn7phgNt_jhrTjSMnn7o5_y7ks5a9y8HwA%3D%3D")

# `header` is spelled out in full rather than relying on partial matching of
# the abbreviated `head`.
bitcoin <- read.csv(text = url, header = TRUE, sep = ",", stringsAsFactors = FALSE)

# Sequential row identifier. seq_len() yields an empty vector for a zero-row
# data frame, whereas seq.int(0) would yield c(1, 0).
bitcoin$ID <- seq_len(nrow(bitcoin))

# Preview the first rows.
head(bitcoin)

##        date   Price   volume   mktcap avg_blk_size no.trnsactions mempoolsize
## 1  9/1/2010 0.06490 429.8832 251747.1  0.000832290            408          NA
## 2  9/3/2010 0.06340  53.5200 246914.5  0.000530263            314          NA
## 3  9/5/2010 0.06290 507.5544 246039.6  0.000532586            303          NA
## 4  9/7/2010 0.06185 207.4314 242922.1  0.000515358            271          NA
## 5  9/9/2010 0.06240 104.0100 246027.6  0.000665894            343          NA
## 6 9/11/2010 0.06200 464.9484 245482.8  0.000467784            296          NA
##   mempoolcount cost.per.trnsaction difficulty trnsct_vol   hash.rate miner.rev
## 1           NA            1.399804   623.3870        688 0.005454017  571.1200
## 2           NA            1.615287   623.3870       1751 0.004958197  507.2000
## 3           NA            1.681485   623.3870        862 0.005020175  509.4900
## 4           NA            1.723127   623.3870       1347 0.004679299  466.9675
## 5           NA            1.373528   712.8849       1015 0.005351093  471.1200
## 6           NA            1.696622   712.8849        556 0.005740907  502.2000
##   orph.block trnsction.per.blk n_transction n_uni_addres  trnsct_fee    gold ID
## 1          0                 1          408          459 0.008437006    1250  1
## 2          0                 1          314          363 0.000000000    1252  2
## 3          0                 2          303          351 0.000629000    1252  3
## 4          0                 1          271          284 0.000000000 1247.25  4
## 5          0                 2          343          383 0.006864000  1253.5  5
## 6          0                 2          296          328 0.000000000 1248.75  6

# Dimensions of the data frame (number of rows, number of columns)
#--------------
dim(bitcoin)

## [1] 1396   20

# Structure of the data frame: column names, types and leading values
#------------------------------
str(bitcoin)

## 'data.frame':    1396 obs. of  20 variables:
##  $ date               : chr  "9/1/2010" "9/3/2010" "9/5/2010" "9/7/2010" ...
##  $ Price              : num  0.0649 0.0634 0.0629 0.0619 0.0624 ...
##  $ volume             : num  429.9 53.5 507.6 207.4 104 ...
##  $ mktcap             : num  251747 246914 246040 242922 246028 ...
##  $ avg_blk_size       : num  0.000832 0.00053 0.000533 0.000515 0.000666 ...
##  $ no.trnsactions     : int  408 314 303 271 343 296 436 450 379 339 ...
##  $ mempoolsize        : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ mempoolcount       : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ cost.per.trnsaction: num  1.4 1.62 1.68 1.72 1.37 ...
##  $ difficulty         : num  623 623 623 623 713 ...
##  $ trnsct_vol         : num  688 1751 862 1347 1015 ...
##  $ hash.rate          : num  0.00545 0.00496 0.00502 0.00468 0.00535 ...
##  $ miner.rev          : num  571 507 509 467 471 ...
##  $ orph.block         : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ trnsction.per.blk  : num  1 1 2 1 2 2 1 2 4 2 ...
##  $ n_transction       : int  408 314 303 271 343 296 436 450 379 339 ...
##  $ n_uni_addres       : int  459 363 351 284 383 328 489 491 382 379 ...
##  $ trnsct_fee         : num  0.008437 0 0.000629 0 0.006864 ...
##  $ gold               : chr  "1250" "1252" "1252" "1247.25" ...
##  $ ID                 : int  1 2 3 4 5 6 7 8 9 10 ...

# Train/test split
#---------------------------------------------------
# Fix the RNG seed so that the 80/20 partition, and every result derived from
# it, can be reproduced between runs.
set.seed(123)

# Logical mask: TRUE marks the rows assigned to the training set. It is named
# `is_train` so that it does not shadow base::sample().
is_train <- sample.split(bitcoin$Price, SplitRatio = 0.80)

train <- subset(bitcoin, is_train)
test <- subset(bitcoin, !is_train)

# Numeric columns of the training set without the response (Price).
# select() is namespaced because MASS, attached later in this script, masks
# dplyr::select().
nums <- train %>%
  select_if(is.numeric) %>%
  dplyr::select(-Price)

# Correlation of every numeric predictor with Price
#------------------------------------
# Some columns (e.g. mempoolsize, mempoolcount) contain NA. With the default
# use = "everything" their correlations would be NA, so pairwise complete
# observations are used instead. The matrix is computed once and drawn twice.
numscorr <- cor(nums, train$Price, use = "pairwise.complete.obs")
colnames(numscorr) <- "Price"
corrplot(numscorr, method = "number")
corrplot(numscorr, method = "color")

# Correlation matrix among the predictors themselves
#------------------------------------
numscorr2 <- cor(nums, use = "pairwise.complete.obs")
corrplot(numscorr2, method = "color")

library(psych)
library(MASS)

## 
## Attaching package: 'MASS'

## The following object is masked from 'package:dplyr':
## 
##     select

# Scatter plot of Price against miner revenue.
plot(train$Price ~train$miner.rev)

# Box plots of Price by miner revenue. miner.rev is continuous, so grouping on
# its raw values would draw one degenerate box per distinct revenue; the
# revenue is binned into deciles first. unique() guards against tied breaks.
rev_breaks <- unique(quantile(train$miner.rev, probs = seq(0, 1, by = 0.1),
                              na.rm = TRUE))
rev_bins <- cut(train$miner.rev, breaks = rev_breaks, include.lowest = TRUE)
boxplot(train$Price ~ rev_bins,
        xlab = "Miner revenue (decile bin)", ylab = "Price")

# Distribution of the response.
hist(train$Price, xlab = "Price", main = "Histogram of Price")

# Miner revenue plotted against row order.
plot(train$miner.rev)

# Descriptive statistics of miner revenue.
describe(train$miner.rev)

##    vars    n    mean      sd   median trimmed     mad    min      max    range
## X1    1 1116 2564019 5811716 999485.4 1170448 1377537 387.51 53191582 53191195
##    skew kurtosis       se
## X1 4.74    27.02 173969.2

# Descriptive statistics of the response variable.
psych::describe(train[["Price"]])

##    vars    n    mean     sd median trimmed    mad  min      max    range skew
## X1    1 1116 1040.17 2524.7 265.07  368.68 383.26 0.06 19289.78 19289.72 3.96
##    kurtosis    se
## X1    17.31 75.58

BOX-COX (transformación de potencia de la variable respuesta; el parámetro λ se elige maximizando la verosimilitud)

# Box-Cox profile log-likelihood for Price modelled on miner revenue; the
# value returned by boxcox() is kept in `bc`.
bc <- MASS::boxcox(Price ~ miner.rev, data = train)

# Full predictor correlation matrix with non-significant cells blanked out.
# Columns containing NA would give NA correlations under the default `use`,
# so pairwise complete observations are used.
numscorr <- cor(nums, use = "pairwise.complete.obs")

# `sig.level` and `insig` only take effect when corrplot() receives a matrix
# of p-values, so one is computed with cor.mtest() at the matching level.
corr_pvalues <- cor.mtest(nums, conf.level = 0.99)$p

allcorr <- corrplot(numscorr, type = "full", method = "color",
                    p.mat = corr_pvalues, sig.level = 0.01, insig = "blank")

# Correlation matrix of three selected predictors, followed by an
# invertibility check: the matrix times its inverse should print (up to
# floating-point error) the identity matrix.
numscorr3 <- cor(nums[, c("hash.rate", "miner.rev", "trnsction.per.blk")])

invnumscorr3 <- solve(numscorr3)

numscorr3 %*% invnumscorr3

##                      hash.rate    miner.rev trnsction.per.blk
## hash.rate         1.000000e+00 1.665335e-16      0.000000e+00
## miner.rev         1.665335e-16 1.000000e+00      1.110223e-16
## trnsction.per.blk 1.110223e-16 5.551115e-17      1.000000e+00

# Maximum-likelihood fit of a log-normal distribution to miner revenue.
MASS::fitdistr(train[["miner.rev"]], densfun = "log-normal")

##      meanlog        sdlog   
##   13.04793479    2.33021590 
##  ( 0.06975319) ( 0.04932295)

# Histogram of miner revenue over the whole training set.
hist(train$miner.rev, main = "Full Miner Revenue Population", xlab = "Miner Revenue")

# Random subsample (without replacement) of miner revenue, refitted with the
# same log-normal model. The seed makes the draw reproducible, and the size is
# capped at the number of available rows so the call cannot fail on a smaller
# training set.
set.seed(123)
OQ1000 <- sample(train$miner.rev, size = min(1000, length(train$miner.rev)))
fitdistr(OQ1000, densfun = "log-normal")

##      meanlog        sdlog   
##   13.05994341    2.32037419 
##  ( 0.07337667) ( 0.05188514)

# Histogram of the miner revenue subsample.
hist(
  x = OQ1000,
  xlab = "Miner Revenue",
  main = "Subset of the Miner Revenue Population"
)

RANDOM FOREST

library(randomForest)

## randomForest 4.7-1.1

## Type rfNews() to see new features/changes/bug fixes.

## 
## Attaching package: 'randomForest'

## The following object is masked from 'package:psych':
## 
##     outlier

## The following object is masked from 'package:dplyr':
## 
##     combine

library(ModelMetrics)

## 
## Attaching package: 'ModelMetrics'

## The following object is masked from 'package:base':
## 
##     kappa

# Modelling table built from the training split only. Re-reading the complete
# CSV into `train` here would let the forest see the held-out test rows and
# make any evaluation on `test` meaningless.
# Keep the numeric columns and drop the row identifier, which is not a
# predictor.
train1 <- train %>%
  select_if(is.numeric) %>%
  dplyr::select(-ID)

# Replace missing values with the sentinel -1 before fitting (randomForest's
# default na.action is na.fail).
train1[is.na(train1)] <- -1

# Seed the RNG so that the fitted forest is reproducible.
# `verbose` is not a randomForest() argument and has been dropped.
set.seed(123)
fit <- randomForest(Price ~ ., data = train1, ntree = 1100)
summary(fit)

##                 Length Class  Mode     
## call               5   -none- call     
## type               1   -none- character
## predicted       1396   -none- numeric  
## mse             1100   -none- numeric  
## rsq             1100   -none- numeric  
## oob.times       1396   -none- numeric  
## importance        16   -none- numeric  
## importanceSD       0   -none- NULL     
## localImportance    0   -none- NULL     
## proximity          0   -none- NULL     
## ntree              1   -none- numeric  
## mtry               1   -none- numeric  
## forest            11   -none- list     
## coefs              0   -none- NULL     
## y               1396   -none- numeric  
## test               0   -none- NULL     
## inbag              0   -none- NULL     
## terms              3   terms  call

# Predictions for the rows of the modelling table.
prd <- predict(fit, train1)

# Apply the same NA -> -1 replacement to the held-out rows before predicting.
test1 <- test
test1[is.na(test1)] <- -1

testprd <- predict(fit, test1)

# Two-column table: sequential ID and predicted Price. The ID is derived from
# the number of predictions instead of a hard-coded 1:280, so the code keeps
# working when the size of the test split changes.
testprd.df <- data.frame(ID = seq_along(testprd), Price = testprd)

head(testprd.df)

##    ID      Price
## 2   1 0.06347210
## 9   2 0.06335408
## 11  3 0.06391456
## 14  4 0.06774162
## 17  5 0.06452431
## 21  6 0.12138752

# In-sample RMSE: error on the same rows the forest was fitted on, which is
# optimistic as a measure of predictive accuracy.
rmse(train1$Price, prd)

## [1] 80.54156

# RMSE on the held-out test rows.
rmse(test1$Price, testprd)

# Variable importance of the fitted forest.
varImpPlot(fit)

Random Forest Bitcoin

2024-01-29