# Bibliotecas
#----------------------------
library(bitops)
library(RCurl)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(corrplot)
## corrplot 0.92 loaded
library(caTools)
# Load the data frame
#--------------------------
# NOTE(review): the URL below carries an access token in its query string.
# Credentials should not live in source control; consider a public URL or
# reading the token from an environment variable.
# Despite its name, `url` holds the downloaded CSV text, not the address.
url <- getURL("https://raw.githubusercontent.com/VioletaStoyanova/data602-finalproject/master/data_bcoin.csv?token=AXwBJU16Hn7phgNt_jhrTjSMnn7o5_y7ks5a9y8HwA%3D%3D")
# `header` is written out in full instead of relying on partial matching of
# `head`.
bitcoin <- read.csv(text = url, header = TRUE, sep = ",", stringsAsFactors = FALSE)
# Fail early when the download is not the expected CSV (for example an HTTP
# error page), instead of silently parsing it into a meaningless data frame.
stopifnot(nrow(bitcoin) > 0, "Price" %in% names(bitcoin))
# Sequential row identifier; seq_len() stays correct for a zero-row frame.
bitcoin$ID <- seq_len(nrow(bitcoin))
head(bitcoin)
## date Price volume mktcap avg_blk_size no.trnsactions mempoolsize
## 1 9/1/2010 0.06490 429.8832 251747.1 0.000832290 408 NA
## 2 9/3/2010 0.06340 53.5200 246914.5 0.000530263 314 NA
## 3 9/5/2010 0.06290 507.5544 246039.6 0.000532586 303 NA
## 4 9/7/2010 0.06185 207.4314 242922.1 0.000515358 271 NA
## 5 9/9/2010 0.06240 104.0100 246027.6 0.000665894 343 NA
## 6 9/11/2010 0.06200 464.9484 245482.8 0.000467784 296 NA
## mempoolcount cost.per.trnsaction difficulty trnsct_vol hash.rate miner.rev
## 1 NA 1.399804 623.3870 688 0.005454017 571.1200
## 2 NA 1.615287 623.3870 1751 0.004958197 507.2000
## 3 NA 1.681485 623.3870 862 0.005020175 509.4900
## 4 NA 1.723127 623.3870 1347 0.004679299 466.9675
## 5 NA 1.373528 712.8849 1015 0.005351093 471.1200
## 6 NA 1.696622 712.8849 556 0.005740907 502.2000
## orph.block trnsction.per.blk n_transction n_uni_addres trnsct_fee gold ID
## 1 0 1 408 459 0.008437006 1250 1
## 2 0 1 314 363 0.000000000 1252 2
## 3 0 2 303 351 0.000629000 1252 3
## 4 0 1 271 284 0.000000000 1247.25 4
## 5 0 2 343 383 0.006864000 1253.5 5
## 6 0 2 296 328 0.000000000 1248.75 6
# Size of the data frame: number of rows, then number of columns
#--------------
c(nrow(bitcoin), ncol(bitcoin))
## [1] 1396 20
# Structure overview: column names, storage types and leading values
#------------------------------
str(object = bitcoin)
## 'data.frame': 1396 obs. of 20 variables:
## $ date : chr "9/1/2010" "9/3/2010" "9/5/2010" "9/7/2010" ...
## $ Price : num 0.0649 0.0634 0.0629 0.0619 0.0624 ...
## $ volume : num 429.9 53.5 507.6 207.4 104 ...
## $ mktcap : num 251747 246914 246040 242922 246028 ...
## $ avg_blk_size : num 0.000832 0.00053 0.000533 0.000515 0.000666 ...
## $ no.trnsactions : int 408 314 303 271 343 296 436 450 379 339 ...
## $ mempoolsize : num NA NA NA NA NA NA NA NA NA NA ...
## $ mempoolcount : num NA NA NA NA NA NA NA NA NA NA ...
## $ cost.per.trnsaction: num 1.4 1.62 1.68 1.72 1.37 ...
## $ difficulty : num 623 623 623 623 713 ...
## $ trnsct_vol : num 688 1751 862 1347 1015 ...
## $ hash.rate : num 0.00545 0.00496 0.00502 0.00468 0.00535 ...
## $ miner.rev : num 571 507 509 467 471 ...
## $ orph.block : int 0 0 0 0 0 0 0 0 0 0 ...
## $ trnsction.per.blk : num 1 1 2 1 2 2 1 2 4 2 ...
## $ n_transction : int 408 314 303 271 343 296 436 450 379 339 ...
## $ n_uni_addres : int 459 363 351 284 383 328 489 491 382 379 ...
## $ trnsct_fee : num 0.008437 0 0.000629 0 0.006864 ...
## $ gold : chr "1250" "1252" "1252" "1247.25" ...
## $ ID : int 1 2 3 4 5 6 7 8 9 10 ...
# Train/test split (80% train, 20% test)
#---------------------------------------------------
# Fix the RNG seed so the random split, and every result derived from it, is
# reproducible from run to run.
set.seed(123)
# Logical mask returned by caTools::sample.split(): TRUE marks training rows.
# It is not called `sample`, so base::sample() is not shadowed.
split_mask <- sample.split(bitcoin$Price, SplitRatio = 0.80)
train <- bitcoin[split_mask, ]
test <- bitcoin[!split_mask, ]
# Numeric columns of the training split without the response (Price).
# select() is namespaced because MASS, attached further down in this script,
# masks dplyr::select(); without the prefix a second run in the same session
# would fail.
nums <- train %>%
  select_if(is.numeric) %>%
  dplyr::select(-Price)
# Correlation matrices (training split)
#------------------------------------
# The str() output recorded earlier in this script shows NA values in the
# mempool columns. cor() returns NA for every pair that touches an NA unless
# `use` is given, so pairwise-complete observations are requested.
#
# Correlation of each numeric predictor with Price (one-column matrix).
# It is computed once and drawn twice: as numbers and as coloured cells.
numscorr <- cor(nums, train$Price, use = "pairwise.complete.obs")
corrplot(numscorr, method = "number")
corrplot(numscorr, method = "color")
# Correlation of the predictors with each other.
numscorr2 <- cor(nums, use = "pairwise.complete.obs")
corrplot(numscorr2, method = "color")
library(psych)
library(MASS)
##
## Attaching package: 'MASS'
## The following object is masked from 'package:dplyr':
##
## select
# Exploratory plots and summary statistics for Price and miner revenue
# (training split)
#------------------------------------
# Scatter plot of Price (y axis) against miner revenue (x axis).
plot(train$Price ~train$miner.rev)
# NOTE(review): miner.rev appears to be continuous, so this formula would draw
# one box per distinct revenue value - confirm this plot is intended.
boxplot(train$Price ~train$miner.rev)
# Distribution of Price.
hist(train$Price, xlab = "Price", main = "Histogram of Price")
# Miner revenue plotted against row position in the training split.
plot(train$miner.rev)
# Descriptive statistics for miner revenue; the `##` lines are console output
# recorded from an earlier run.
describe(train$miner.rev)
## vars n mean sd median trimmed mad min max range
## X1 1 1116 2564019 5811716 999485.4 1170448 1377537 387.51 53191582 53191195
## skew kurtosis se
## X1 4.74 27.02 173969.2
# Descriptive statistics for Price.
describe(train$Price)
## vars n mean sd median trimmed mad min max range skew
## X1 1 1116 1040.17 2524.7 265.07 368.68 383.26 0.06 19289.78 19289.72 3.96
## kurtosis se
## X1 17.31 75.58
# Box-Cox (family of power transformations; lambda is chosen to maximise the likelihood)
#------------------------------------
# Box-Cox profile log-likelihood for the regression of Price on miner revenue.
bc <- boxcox(train$Price ~ train$miner.rev)
# Predictor correlation matrix. Pairwise-complete observations are used so
# that columns containing NA values do not turn whole rows/columns into NA.
numscorr <- cor(nums, use = "pairwise.complete.obs")
# `sig.level` and `insig` only take effect when corrplot() receives a matrix
# of p-values through `p.mat`; corrplot::cor.mtest() supplies it, so cells
# whose p-value exceeds 0.01 are actually blanked.
allcorr <- corrplot(
  numscorr,
  type = "full",
  method = "color",
  p.mat = cor.mtest(nums)$p,
  sig.level = 0.01,
  insig = "blank"
)
# Collinearity check on three predictors: the correlation matrix multiplied by
# its inverse should print (numerically) the identity matrix.
numscorr3 <- nums %>%
  dplyr::select(hash.rate, miner.rev, trnsction.per.blk) %>%
  cor()
invnumscorr3 <- solve(numscorr3)
numscorr3 %*% invnumscorr3
## hash.rate miner.rev trnsction.per.blk
## hash.rate 1.000000e+00 1.665335e-16 0.000000e+00
## miner.rev 1.665335e-16 1.000000e+00 1.110223e-16
## trnsction.per.blk 1.110223e-16 5.551115e-17 1.000000e+00
# Fit a log-normal distribution to miner revenue over the whole training
# split. The `##` lines are console output recorded from an earlier run.
fitdistr(train$miner.rev, densfun = "log-normal")
## meanlog sdlog
## 13.04793479 2.33021590
## ( 0.06975319) ( 0.04932295)
hist(train$miner.rev, main = "Full Miner Revenue Population", xlab = "Miner Revenue")
# Random subsample of up to 1000 values, drawn without replacement. The size
# is capped at the population size because sample() raises an error when asked
# for more values than exist, which would happen with a smaller training split.
OQ1000 <- sample(train$miner.rev, min(1000, length(train$miner.rev)))
fitdistr(OQ1000, densfun = "log-normal")
## meanlog sdlog
## 13.05994341 2.32037419
## ( 0.07337667) ( 0.05188514)
hist(OQ1000, main = "Subset of the Miner Revenue Population", xlab = "Miner Revenue")
# Random forest
#------------------------------------
library(randomForest)
## randomForest 4.7-1.1
## Type rfNews() to see new features/changes/bug fixes.
##
## Attaching package: 'randomForest'
## The following object is masked from 'package:psych':
##
## outlier
## The following object is masked from 'package:dplyr':
##
## combine
library(ModelMetrics)
##
## Attaching package: 'ModelMetrics'
## The following object is masked from 'package:base':
##
## kappa
# Random forest regression of Price on the numeric predictors
#------------------------------------
# The forest is fitted on the training split only. Reloading the complete CSV
# into `train` would put the held-out rows into the fit, so predictions for
# `test` would no longer be out-of-sample.
# NOTE: console output recorded from the earlier full-data run is not repeated
# here, because every number changes once the held-out rows are excluded.
train1 <- train %>%
  select_if(is.numeric) %>%
  dplyr::select(-ID)
# randomForest() fails on missing values by default (na.action = na.fail);
# NAs are replaced with the sentinel value -1.
train1[is.na(train1)] <- -1
# Seed the RNG so the bootstrap samples, and therefore the forest, are
# reproducible.
set.seed(123)
# `verbose` is not a randomForest() argument (it was silently swallowed by
# `...`), so it is not passed.
fit <- randomForest(Price ~ ., data = train1, ntree = 1100)
summary(fit)
# In-sample predictions.
prd <- predict(fit, train1)
# Held-out rows get the same NA sentinel before prediction.
test1 <- test
test1[is.na(test1)] <- -1
testprd <- predict(fit, test1)
# Two-column table (ID, Price) of the held-out predictions. IDs are generated
# from the number of predictions instead of a hard-coded 1:280, which fails
# whenever the split yields a different number of test rows.
testprd.df <- data.frame(ID = seq_along(testprd), Price = testprd)
head(testprd.df)
# Training RMSE (optimistic: the model has seen these rows).
rmse(train1$Price, prd)
# Held-out RMSE: the estimate of out-of-sample error.
rmse(test1$Price, testprd)
varImpPlot(fit)