Getting rid of the index and unnamed columns imported from Python
# Load the sample exported from Python. The first column and the 18th column
# of the raw file (i.e. the 17th once the first is gone) are dropped by
# position, then every row containing an NA is discarded.
sample <- read.csv("Sample_Data.csv")
sample <- na.omit(sample[, -c(1, 18)])
Scaling the data (time stamp, block number, hash, and the categorical Eur_night should not be scaled, nor should the target gas price)
# Column positions left out of scaling: the first four columns and column 15.
drop_columns <- c(1:4, 15)
# Centre and scale the remaining columns; scale() returns a numeric matrix.
sample_scale <- scale(sample[, -drop_columns])
Adding Eur_night back in as a categorical variable, and adding back the target gas price
# Back to a data frame so that non-numeric columns can be attached.
sample_scale <- as.data.frame(sample_scale)
# Eur_night enters the model as a factor; the response stays on its raw scale.
sample_scale[["Eur_night"]] <- factor(sample[["Eur_night"]])
sample_scale[["target"]] <- sample[["gas_price"]]
Full regression analysis on gas price as response
# Full linear model: gas price regressed on every other column of sample_scale.
model_data <- lm(target ~ ., data = sample_scale)
summary(model_data)
##
## Call:
## lm(formula = target ~ ., data = sample_scale)
##
## Residuals:
## Min 1Q Median 3Q Max
## -12498 -37 -15 13 32414
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 101.8360 0.6454 157.796 < 2e-16 ***
## receipt_gas_used -24.9412 0.4227 -59.001 < 2e-16 ***
## totalfee 118.9964 0.4231 281.271 < 2e-16 ***
## past_min 0.2896 1.0270 0.282 0.777938
## past_max -0.2992 0.6142 -0.487 0.626117
## past_median 7.6473 1.4800 5.167 2.38e-07 ***
## blck_med 0.1117 1.2334 0.091 0.927837
## blck_min -0.3582 0.6235 -0.574 0.565668
## blck_max 1.7639 0.5288 3.336 0.000851 ***
## blck_totalfee 2.5774 1.1555 2.231 0.025710 *
## blck_count 0.9412 0.4275 2.202 0.027695 *
## eth_high 33.8273 0.5362 63.083 < 2e-16 ***
## Eur_night1 1.4770 0.8428 1.752 0.079699 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 183.2 on 196156 degrees of freedom
## Multiple R-squared: 0.3208, Adjusted R-squared: 0.3208
## F-statistic: 7722 on 12 and 196156 DF, p-value: < 2.2e-16
Getting rid of NA data and unnecessary columns
# Drop columns 6 through 10 of sample_scale by position and refit.
drop_columns2 <- 6:10
sample_scale2 <- sample_scale[, -drop_columns2]
model2 <- lm(target ~ ., data = sample_scale2)
summary(model2)
##
## Call:
## lm(formula = target ~ ., data = sample_scale2)
##
## Residuals:
## Min 1Q Median 3Q Max
## -12508 -37 -15 13 32413
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 101.76256 0.64467 157.853 <2e-16 ***
## receipt_gas_used -24.97888 0.42262 -59.104 <2e-16 ***
## totalfee 119.02510 0.42308 281.329 <2e-16 ***
## past_min 0.27580 1.02306 0.270 0.7875
## past_max -0.09118 0.61170 -0.149 0.8815
## past_median 9.82554 1.18567 8.287 <2e-16 ***
## eth_high 33.84553 0.53182 63.641 <2e-16 ***
## Eur_night1 1.60197 0.84123 1.904 0.0569 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 183.3 on 196161 degrees of freedom
## Multiple R-squared: 0.3207, Adjusted R-squared: 0.3206
## F-statistic: 1.323e+04 on 7 and 196161 DF, p-value: < 2.2e-16
# Normal Q-Q plot of the studentized residuals with the y = x reference line.
qqnorm(rstudent(model2))
abline(a = 0, b = 1, col = "red")
# Studentized residuals against fitted values, clipped to [-5, 5], with
# horizontal guides at +/- 3.
plot(model2$fitted.values, rstudent(model2), ylim = c(-5, 5))
abline(h = c(3, -3), col = "red")
looking at leverage
# Leverage diagnostics for model2.
# k is the number of slope coefficients (7 for model2); deriving it from the
# fit keeps the 2 * (k + 1) / n leverage cut-off correct if the model changes.
k <- length(coef(model2)) - 1
n <- nrow(sample_scale2)

# hatvalues() and rstudent() are costly on a data set of this size, so each is
# computed once and reused for the plot and the selection below.
model2_hat <- hatvalues(model2)
model2_rstud <- rstudent(model2)
leverage_cut <- 2 * (k + 1) / n

plot(model2_hat, model2_rstud,
     xlab = "hat values", ylab = "studentized residuals",
     pch = 16, main = "Residual vs leverage",
     cex = 1.5, cex.main = 1.5, cex.lab = 1.5, cex.axis = 1.5)
abline(v = leverage_cut, col = "red", lwd = 3, lty = 2)
abline(h = c(-3, 3), col = "red", lwd = 3, lty = 2)

# Rows that are both high-leverage and have |studentized residual| > 3.
leverage <- which(model2_hat > leverage_cut & abs(model2_rstud) > 3)
gas_levearge <- sample_scale2[leverage, ]
gas_levearge
summary(gas_levearge$target)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 12.0 111.8 1298.9 4058.1 5075.1 34912.0
Just looking at outliers: observations whose studentized residual is above 3 or below -3
# Row positions whose studentized residual lies outside [-3, 3].
outliers <- which(abs(rstudent(model2)) > 3)
gas_outliers <- sample_scale2[outliers, ]
summary(gas_outliers)
## receipt_gas_used totalfee past_min past_max
## Min. :-0.37196 Min. : 0.0375 Min. :-1.12499 Min. :-0.879027
## 1st Qu.:-0.31856 1st Qu.: 0.3082 1st Qu.:-0.51187 1st Qu.:-0.425772
## Median :-0.26557 Median : 0.6669 Median :-0.07212 Median :-0.007797
## Mean : 2.52735 Mean : 3.1128 Mean : 0.21563 Mean : 0.173870
## 3rd Qu.: 0.04586 3rd Qu.: 1.8969 3rd Qu.: 0.66478 3rd Qu.: 0.451188
## Max. :82.08100 Max. :398.5944 Max. :12.49627 Max. :10.894746
## past_median eth_high Eur_night target
## Min. :-0.97367 Min. :-1.1048 0:209 Min. : 12.0
## 1st Qu.:-0.50020 1st Qu.:-0.6685 1:318 1st Qu.: 806.2
## Median : 0.03345 Median : 0.3776 Median : 1039.4
## Mean : 0.24611 Mean : 0.3388 Mean : 2034.7
## 3rd Qu.: 0.73053 3rd Qu.: 1.4572 3rd Qu.: 1744.4
## Max. :12.42837 Max. : 2.0662 Max. :34912.0
# Number of outlying observations.
nrow(gas_outliers)
## [1] 527
# Distribution of the gas price among the outliers, then the rows themselves.
hist(gas_outliers$target)
print(gas_outliers)
Looking at these with original data
# Summarise the unscaled rows of `sample` at the positions stored in `outliers`.
summary(sample[outliers,])
## time_stamp block_number hash gas_price
## Length:527 Min. :10974925 Min. :5.786e+73 Min. : 12.0
## Class :character 1st Qu.:11338118 1st Qu.:2.789e+76 1st Qu.: 806.2
## Mode :character Median :11602472 Median :5.801e+76 Median : 1039.4
## Mean :11569641 Mean :5.884e+76 Mean : 2034.7
## 3rd Qu.:11849119 3rd Qu.:8.951e+76 3rd Qu.: 1744.4
## Max. :11947143 Max. :1.157e+77 Max. :34912.0
## receipt_gas_used totalfee past_min past_max
## Min. : 13116 Min. :8.538e+06 Min. : 3.299 Min. : 27.44
## 1st Qu.: 21000 1st Qu.:2.212e+07 1st Qu.: 38.638 1st Qu.: 90.82
## Median : 28823 Median :4.011e+07 Median : 63.985 Median : 149.26
## Mean : 441183 Mean :1.628e+08 Mean : 80.570 Mean : 174.66
## 3rd Qu.: 74804 3rd Qu.:1.018e+08 3rd Qu.:106.459 3rd Qu.: 213.44
## Max. :12186847 Max. :2.000e+10 Max. :788.409 Max. :1673.71
## past_median blck_med blck_min blck_max
## Min. : 20.17 Min. : 1.0 Min. : 0.00 Min. : 31.0
## 1st Qu.: 58.31 1st Qu.: 50.0 1st Qu.: 15.05 1st Qu.: 212.6
## Median : 101.30 Median : 92.3 Median : 52.80 Median : 355.0
## Mean : 118.43 Mean : 112.9 Mean : 72.99 Mean : 949.5
## 3rd Qu.: 157.45 3rd Qu.: 150.5 3rd Qu.:115.00 3rd Qu.: 700.0
## Max. :1099.74 Max. :1437.5 Max. :700.00 Max. :30000.0
## blck_totalfee blck_count Eur_night eth_high
## Min. :5.935e+07 Min. : 8.0 Min. :0.0000 Min. : 342.6
## 1st Qu.:7.732e+08 1st Qu.:166.0 1st Qu.:0.0000 1st Qu.: 575.6
## Median :1.337e+09 Median :204.0 Median :1.0000 Median :1134.3
## Mean :1.648e+09 Mean :209.4 Mean :0.6034 Mean :1113.6
## 3rd Qu.:2.046e+09 3rd Qu.:241.5 3rd Qu.:1.0000 3rd Qu.:1711.0
## Max. :3.353e+10 Max. :582.0 Max. :1.0000 Max. :2036.3
# Summary of the full unscaled data, for comparison with the outlier subset.
summary(sample)
## time_stamp block_number hash gas_price
## Length:196169 Min. :10966876 Min. :4.411e+71 Min. : 0.00
## Class :character 1st Qu.:11244657 1st Qu.:2.885e+76 1st Qu.: 40.96
## Mode :character Median :11490543 Median :5.799e+76 Median : 72.00
## Mean :11481572 Mean :5.789e+76 Mean : 102.70
## 3rd Qu.:11725084 3rd Qu.:8.688e+76 3rd Qu.: 126.00
## Max. :11948958 Max. :1.158e+77 Max. :34912.03
## receipt_gas_used totalfee past_min past_max
## Min. : 13013 Min. :0.000e+00 Min. : 0.00 Min. : 18.05
## 1st Qu.: 21000 1st Qu.:1.428e+06 1st Qu.: 31.07 1st Qu.: 73.66
## Median : 37793 Median :2.966e+06 Median : 51.80 Median : 115.72
## Mean : 68033 Mean :6.654e+06 Mean : 68.14 Mean : 150.35
## 3rd Qu.: 58655 3rd Qu.:6.453e+06 3rd Qu.: 87.54 3rd Qu.: 188.32
## Max. :12186847 Max. :2.000e+10 Max. :1330.40 Max. :14309.59
## past_median blck_med blck_min blck_max
## Min. : 13.00 Min. : 0.00 Min. : 0.00 Min. : 1.0
## 1st Qu.: 47.58 1st Qu.: 40.00 1st Qu.: 16.00 1st Qu.: 166.7
## Median : 74.13 Median : 68.00 Median : 42.14 Median : 260.0
## Mean : 98.60 Mean : 91.78 Mean : 60.58 Mean : 618.1
## 3rd Qu.: 126.22 3rd Qu.: 118.00 3rd Qu.: 84.00 3rd Qu.: 500.0
## Max. :1823.15 Max. :3888.00 Max. :1400.45 Max. :210000.0
## blck_totalfee blck_count Eur_night eth_high
## Min. :5.630e+06 Min. : 5.0 Min. :0.0000 Min. : 342.6
## 1st Qu.:5.934e+08 1st Qu.:163.0 1st Qu.:0.0000 1st Qu.: 465.7
## Median :9.653e+08 Median :196.0 Median :1.0000 Median : 637.1
## Mean :1.280e+09 Mean :202.3 Mean :0.5877 Mean : 932.7
## 3rd Qu.:1.624e+09 3rd Qu.:232.0 3rd Qu.:1.0000 3rd Qu.:1378.9
## Max. :9.682e+10 Max. :596.0 Max. :1.0000 Max. :2036.3
looking at prices below 800
# Outliers whose gas price is below 800.
# `outliers` stores row positions in sample_scale2, so the price test is applied
# to those rows and the result is kept as positions in the full data. Taking
# which() over the outlier subset alone would yield positions 1..length(outliers)
# and select unrelated rows from the top of sample_scale2.
below800 <- outliers[sample_scale2$target[outliers] < 800]
summary(sample_scale2[below800, ])
## receipt_gas_used totalfee past_min past_max
## Min. :-0.32921 Min. :-0.13221 Min. :-0.9632 Min. :-0.5498
## 1st Qu.:-0.31856 1st Qu.:-0.09957 1st Qu.:-0.2879 1st Qu.:-0.3810
## Median :-0.20660 Median :-0.07216 Median :-0.1097 Median :-0.2349
## Mean :-0.06736 Mean :-0.04129 Mean :-0.1285 Mean : 0.1375
## 3rd Qu.:-0.07818 3rd Qu.:-0.02342 3rd Qu.: 0.0487 3rd Qu.: 0.1378
## Max. : 4.06131 Max. : 0.68181 Max. : 0.3290 Max. : 6.1526
## past_median eth_high Eur_night target
## Min. :-0.5471 Min. :-1.055 0:99 Min. : 1.00
## 1st Qu.:-0.2995 1st Qu.:-1.055 1:25 1st Qu.: 66.00
## Median :-0.2173 Median :-1.055 Median : 77.00
## Mean :-0.2075 Mean :-1.055 Mean : 79.28
## 3rd Qu.:-0.1319 3rd Qu.:-1.055 3rd Qu.: 84.50
## Max. : 0.3336 Max. :-1.055 Max. :400.00
Regular data
# Unscaled rows for the outliers priced below 800. The row positions are derived
# directly from `outliers` (positions in the full data) so that the rows shown
# are the outliers themselves rather than rows picked by subset-relative indices.
sample[outliers[sample_scale2$target[outliers] < 800], ]
Looking at prices above 800
# Outliers whose gas price is above 800.
# The comparison is made on the outlier rows and mapped back to positions in
# the full data; subset-relative positions from which() must not be used to
# index sample_scale2 directly.
above800 <- outliers[sample_scale2$target[outliers] > 800]
summary(sample_scale2[above800, ])
## receipt_gas_used totalfee past_min past_max
## Min. :-0.31958 Min. :-0.132214 Min. :-1.04704 Min. :-0.59702
## 1st Qu.:-0.31856 1st Qu.:-0.098729 1st Qu.:-0.26867 1st Qu.:-0.40471
## Median :-0.18176 Median :-0.069662 Median :-0.12614 Median :-0.26463
## Mean : 0.05829 Mean :-0.007382 Mean :-0.13227 Mean : 0.04212
## 3rd Qu.: 0.08070 3rd Qu.:-0.000528 3rd Qu.: 0.05198 3rd Qu.: 0.16473
## Max. :17.01453 Max. : 4.238675 Max. : 0.50680 Max. : 6.15262
## past_median eth_high Eur_night target
## Min. :-0.5546 Min. :-1.055 0:230 Min. : 1.00
## 1st Qu.:-0.3032 1st Qu.:-1.055 1:167 1st Qu.: 67.00
## Median :-0.2076 Median :-1.055 Median : 77.00
## Mean :-0.1918 Mean :-1.055 Mean : 80.73
## 3rd Qu.:-0.1148 3rd Qu.:-1.055 3rd Qu.: 86.90
## Max. : 0.5170 Max. :-1.055 Max. :390.00
Refitting with everything above 800 gone, and everything that is 0 gwei
# Rows to discard: gas price above 800, or exactly 0 gwei.
outliers2 <- which(sample_scale2$target > 800 | sample_scale2$target == 0)
# Keep the complement by positive positions. Negative indexing with an empty
# `outliers2` (no matching rows) would return zero rows instead of all of them.
keep_rows3 <- setdiff(seq_len(nrow(sample_scale2)), outliers2)
sample_scale3 <- sample_scale2[keep_rows3, ]
model3 <- lm(target ~ ., data = sample_scale3)
summary(model3)
##
## Call:
## lm(formula = target ~ ., data = sample_scale3)
##
## Residuals:
## Min 1Q Median 3Q Max
## -6364.2 -32.0 -10.9 16.1 993.0
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 97.8664 0.2426 403.404 < 2e-16 ***
## receipt_gas_used -27.6398 0.2059 -134.232 < 2e-16 ***
## totalfee 137.8079 0.7032 195.978 < 2e-16 ***
## past_min 0.7736 0.3851 2.009 0.0446 *
## past_max -0.2865 0.2305 -1.243 0.2138
## past_median 8.9014 0.4462 19.950 < 2e-16 ***
## eth_high 32.0375 0.2033 157.604 < 2e-16 ***
## Eur_night1 1.4574 0.3165 4.605 4.13e-06 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 68.86 on 195694 degrees of freedom
## Multiple R-squared: 0.398, Adjusted R-squared: 0.398
## F-statistic: 1.848e+04 on 7 and 195694 DF, p-value: < 2.2e-16
# Draw the standard lm diagnostic plots for model3.
plot(model3)
Refit using step() on the model that still includes the outliers
# MASS is attached here; boxcox() further down in this script comes from it.
library(MASS)
# Backward stepwise selection starting from model2; k = 2 is the AIC penalty.
step(model2, direction = "backward", k = 2)
## Start: AIC=2044436
## target ~ receipt_gas_used + totalfee + past_min + past_max +
## past_median + eth_high + Eur_night
##
## Df Sum of Sq RSS AIC
## - past_max 1 746 6587641090 2044434
## - past_min 1 2441 6587642785 2044434
## <none> 6587640344 2044436
## - Eur_night 1 121787 6587762131 2044438
## - past_median 1 2306217 6589946561 2044503
## - receipt_gas_used 1 117315057 6704955401 2047897
## - eth_high 1 136015812 6723656156 2048443
## - totalfee 1 2657937495 9245577839 2110926
##
## Step: AIC=2044434
## target ~ receipt_gas_used + totalfee + past_min + past_median +
## eth_high + Eur_night
##
## Df Sum of Sq RSS AIC
## - past_min 1 2605 6587643695 2044432
## <none> 6587641090 2044434
## - Eur_night 1 121889 6587762979 2044436
## - past_median 1 2840489 6590481579 2044517
## - receipt_gas_used 1 117317916 6704959006 2047895
## - eth_high 1 137236599 6724877689 2048477
## - totalfee 1 2657943379 9245584469 2110924
##
## Step: AIC=2044432
## target ~ receipt_gas_used + totalfee + past_median + eth_high +
## Eur_night
##
## Df Sum of Sq RSS AIC
## <none> 6587643695 2044432
## - Eur_night 1 121899 6587765594 2044434
## - past_median 1 12013659 6599657354 2044788
## - receipt_gas_used 1 117318326 6704962021 2047893
## - eth_high 1 137587967 6725231662 2048485
## - totalfee 1 2657942694 9245586389 2110922
##
## Call:
## lm(formula = target ~ receipt_gas_used + totalfee + past_median +
## eth_high + Eur_night, data = sample_scale2)
##
## Coefficients:
## (Intercept) receipt_gas_used totalfee past_median
## 101.762 -24.979 119.025 10.002
## eth_high Eur_night1
## 33.860 1.603
# Forward stepwise selection: start from the intercept-only model and allow
# terms up to those in model2, using the AIC penalty (k = 2).
basemodel <- lm(target ~ 1, data = sample_scale2)
step(
  basemodel,
  scope = list(upper = model2, lower = ~1),
  direction = "forward",
  k = 2
)
## Start: AIC=2120269
## target ~ 1
##
## Df Sum of Sq RSS AIC
## + totalfee 1 2650171422 7046992553 2057647
## + eth_high 1 434062350 9263101625 2111287
## + past_median 1 262159900 9435004075 2114894
## + past_min 1 227312505 9469851471 2115617
## + past_max 1 121624046 9575539930 2117795
## + Eur_night 1 1480981 9695682994 2120241
## + receipt_gas_used 1 998076 9696165899 2120250
## <none> 9697163975 2120269
##
## Step: AIC=2057647
## target ~ totalfee
##
## Df Sum of Sq RSS AIC
## + eth_high 1 329141901 6717850652 2048266
## + past_median 1 197889271 6849103282 2052062
## + past_min 1 171743340 6875249213 2052809
## + receipt_gas_used 1 133084294 6913908259 2053909
## + past_max 1 89969894 6957022659 2055128
## + Eur_night 1 952756 7046039797 2057623
## <none> 7046992553 2057647
##
## Step: AIC=2048266
## target ~ totalfee + eth_high
##
## Df Sum of Sq RSS AIC
## + receipt_gas_used 1 117985277 6599865375 2044792
## + past_median 1 12712356 6705138296 2047896
## + past_min 1 9719683 6708130969 2047984
## + past_max 1 5499302 6712351350 2048107
## + Eur_night 1 280514 6717570138 2048260
## <none> 6717850652 2048266
##
## Step: AIC=2044792
## target ~ totalfee + eth_high + receipt_gas_used
##
## Df Sum of Sq RSS AIC
## + past_median 1 12099781 6587765594 2044434
## + past_min 1 9243432 6590621943 2044519
## + past_max 1 5342310 6594523065 2044635
## + Eur_night 1 208021 6599657354 2044788
## <none> 6599865375 2044792
##
## Step: AIC=2044434
## target ~ totalfee + eth_high + receipt_gas_used + past_median
##
## Df Sum of Sq RSS AIC
## + Eur_night 1 121899 6587643695 2044432
## <none> 6587765594 2044434
## + past_min 1 2615 6587762979 2044436
## + past_max 1 1023 6587764571 2044436
##
## Step: AIC=2044432
## target ~ totalfee + eth_high + receipt_gas_used + past_median +
## Eur_night
##
## Df Sum of Sq RSS AIC
## <none> 6587643695 2044432
## + past_min 1 2604.58 6587641090 2044434
## + past_max 1 909.99 6587642785 2044434
##
## Call:
## lm(formula = target ~ totalfee + eth_high + receipt_gas_used +
## past_median + Eur_night, data = sample_scale2)
##
## Coefficients:
## (Intercept) totalfee eth_high receipt_gas_used
## 101.762 119.025 33.860 -24.979
## past_median Eur_night1
## 10.002 1.603
Both procedures select the same model (totalfee, eth_high, receipt_gas_used, past_median, Eur_night)
# Remove columns 3 and 4 of sample_scale2 by position and refit on what is left.
drop_columns3 <- c(3, 4)
sample_scale4 <- sample_scale2[, -drop_columns3]
model4 <- lm(target ~ ., data = sample_scale4)
summary(model4)
##
## Call:
## lm(formula = target ~ ., data = sample_scale4)
##
## Residuals:
## Min 1Q Median 3Q Max
## -12508 -37 -15 13 32413
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 101.7621 0.6447 157.854 <2e-16 ***
## receipt_gas_used -24.9791 0.4226 -59.105 <2e-16 ***
## totalfee 119.0250 0.4231 281.330 <2e-16 ***
## past_median 10.0020 0.5288 18.914 <2e-16 ***
## eth_high 33.8597 0.5290 64.008 <2e-16 ***
## Eur_night1 1.6027 0.8412 1.905 0.0568 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 183.3 on 196163 degrees of freedom
## Multiple R-squared: 0.3207, Adjusted R-squared: 0.3206
## F-statistic: 1.852e+04 on 5 and 196163 DF, p-value: < 2.2e-16
# Normal Q-Q plot of model4's studentized residuals with the y = x line.
qqnorm(rstudent(model4))
abline(a = 0, b = 1, col = "red")
# Studentized residuals against fitted values, with guides at +/- 3.
plot(model4$fitted.values, rstudent(model4), ylim = c(-5, 5))
abline(h = c(3, -3), col = "red")
# k2: number of predictors used in the leverage cut-off 2 * (k2 + 1) / n.
k2 <- 5
n <- nrow(sample_scale4)
plot(
  hatvalues(model4), rstudent(model4),
  xlab = "hat values", ylab = "studentized residuals",
  pch = 16, main = "Residual vs leverage",
  cex = 1.5, cex.main = 1.5, cex.lab = 1.5, cex.axis = 1.5
)
abline(v = 2 * (k2 + 1) / n, col = "red", lwd = 3, lty = 2)
abline(h = c(-3, 3), col = "red", lwd = 3, lty = 2)
Looking at leverage
# Rows of model4 that exceed the leverage cut-off and also have a studentized
# residual outside [-3, 3].
leverage2 <- which(
  hatvalues(model4) > 2 * (k2 + 1) / n & abs(rstudent(model4)) > 3
)
gas_levearge2 <- sample_scale4[leverage2, ]
print(gas_levearge2)
non-scaled values
# Print, then summarise, the unscaled rows of `sample` at the positions in
# `leverage2`.
sample[leverage2,]
summary(sample[leverage2,])
## time_stamp block_number hash gas_price
## Length:117 Min. :10974925 Min. :6.082e+73 Min. : 12.0
## Class :character 1st Qu.:11253924 1st Qu.:2.550e+76 1st Qu.: 113.5
## Mode :character Median :11531989 Median :5.595e+76 Median : 1945.8
## Mean :11514722 Mean :5.818e+76 Mean : 4220.0
## 3rd Qu.:11798673 3rd Qu.:9.197e+76 3rd Qu.: 5555.0
## Max. :11947143 Max. :1.157e+77 Max. :34912.0
## receipt_gas_used totalfee past_min past_max
## Min. : 13247 Min. :1.854e+07 Min. : 11.65 Min. : 27.44
## 1st Qu.: 62494 1st Qu.:9.289e+07 1st Qu.: 30.29 1st Qu.: 79.31
## Median : 126331 Median :1.950e+08 Median : 61.92 Median : 145.83
## Mean : 1859190 Mean :5.722e+08 Mean : 97.02 Mean : 212.09
## 3rd Qu.: 5110271 3rd Qu.:4.689e+08 3rd Qu.:127.45 3rd Qu.: 224.32
## Max. :12186847 Max. :2.000e+10 Max. :788.41 Max. :1403.99
## past_median blck_med blck_min blck_max
## Min. : 20.91 Min. : 9.0 Min. : 0.00 Min. : 31.0
## 1st Qu.: 48.79 1st Qu.: 42.0 1st Qu.: 15.00 1st Qu.: 200.0
## Median : 89.44 Median : 82.5 Median : 41.00 Median : 325.2
## Mean : 141.69 Mean : 137.4 Mean : 79.17 Mean : 1375.1
## 3rd Qu.: 174.56 3rd Qu.: 159.3 3rd Qu.:116.00 3rd Qu.: 700.0
## Max. :1099.74 Max. :1437.5 Max. :700.00 Max. :30000.0
## blck_totalfee blck_count Eur_night eth_high
## Min. :1.940e+08 Min. : 8 Min. :0.0000 Min. : 351.6
## 1st Qu.:6.205e+08 1st Qu.:146 1st Qu.:0.0000 1st Qu.: 467.7
## Median :1.190e+09 Median :200 Median :1.0000 Median : 668.8
## Mean :2.159e+09 Mean :199 Mean :0.5299 Mean :1017.7
## 3rd Qu.:2.196e+09 3rd Qu.:245 3rd Qu.:1.0000 3rd Qu.:1670.2
## Max. :3.353e+10 Max. :444 Max. :1.0000 Max. :2036.3
Looking at outliers
# Row positions whose studentized residual under model4 lies outside [-3, 3].
# Note that this reassigns `outliers2`.
outliers2 <- which(abs(rstudent(model4)) > 3)
gas_outliers2 <- sample_scale4[outliers2, ]
summary(gas_outliers2)
## receipt_gas_used totalfee past_median eth_high
## Min. :-0.37196 Min. : 0.0375 Min. :-0.97367 Min. :-1.1048
## 1st Qu.:-0.31856 1st Qu.: 0.3082 1st Qu.:-0.50020 1st Qu.:-0.6685
## Median :-0.26557 Median : 0.6669 Median : 0.03345 Median : 0.3776
## Mean : 2.52735 Mean : 3.1128 Mean : 0.24611 Mean : 0.3388
## 3rd Qu.: 0.04586 3rd Qu.: 1.8969 3rd Qu.: 0.73053 3rd Qu.: 1.4572
## Max. :82.08100 Max. :398.5944 Max. :12.42837 Max. : 2.0662
## Eur_night target
## 0:209 Min. : 12.0
## 1:318 1st Qu.: 806.2
## Median : 1039.4
## Mean : 2034.7
## 3rd Qu.: 1744.4
## Max. :34912.0
# Number of outlying observations under model4.
nrow(gas_outliers2)
## [1] 527
# Distribution of their gas price, then the rows themselves.
hist(gas_outliers2$target)
print(gas_outliers2)
nonscaled data
# Print, then summarise, the unscaled rows of `sample` at the positions in
# `outliers2`.
sample[outliers2,]
summary(sample[outliers2,])
## time_stamp block_number hash gas_price
## Length:527 Min. :10974925 Min. :5.786e+73 Min. : 12.0
## Class :character 1st Qu.:11338118 1st Qu.:2.789e+76 1st Qu.: 806.2
## Mode :character Median :11602472 Median :5.801e+76 Median : 1039.4
## Mean :11569641 Mean :5.884e+76 Mean : 2034.7
## 3rd Qu.:11849119 3rd Qu.:8.951e+76 3rd Qu.: 1744.4
## Max. :11947143 Max. :1.157e+77 Max. :34912.0
## receipt_gas_used totalfee past_min past_max
## Min. : 13116 Min. :8.538e+06 Min. : 3.299 Min. : 27.44
## 1st Qu.: 21000 1st Qu.:2.212e+07 1st Qu.: 38.638 1st Qu.: 90.82
## Median : 28823 Median :4.011e+07 Median : 63.985 Median : 149.26
## Mean : 441183 Mean :1.628e+08 Mean : 80.570 Mean : 174.66
## 3rd Qu.: 74804 3rd Qu.:1.018e+08 3rd Qu.:106.459 3rd Qu.: 213.44
## Max. :12186847 Max. :2.000e+10 Max. :788.409 Max. :1673.71
## past_median blck_med blck_min blck_max
## Min. : 20.17 Min. : 1.0 Min. : 0.00 Min. : 31.0
## 1st Qu.: 58.31 1st Qu.: 50.0 1st Qu.: 15.05 1st Qu.: 212.6
## Median : 101.30 Median : 92.3 Median : 52.80 Median : 355.0
## Mean : 118.43 Mean : 112.9 Mean : 72.99 Mean : 949.5
## 3rd Qu.: 157.45 3rd Qu.: 150.5 3rd Qu.:115.00 3rd Qu.: 700.0
## Max. :1099.74 Max. :1437.5 Max. :700.00 Max. :30000.0
## blck_totalfee blck_count Eur_night eth_high
## Min. :5.935e+07 Min. : 8.0 Min. :0.0000 Min. : 342.6
## 1st Qu.:7.732e+08 1st Qu.:166.0 1st Qu.:0.0000 1st Qu.: 575.6
## Median :1.337e+09 Median :204.0 Median :1.0000 Median :1134.3
## Mean :1.648e+09 Mean :209.4 Mean :0.6034 Mean :1113.6
## 3rd Qu.:2.046e+09 3rd Qu.:241.5 3rd Qu.:1.0000 3rd Qu.:1711.0
## Max. :3.353e+10 Max. :582.0 Max. :1.0000 Max. :2036.3
First quartile gas is the same for outliers
What gas price would look like without outliers
# Summary of the unscaled data with the rows in `outliers2` excluded.
summary(sample[-outliers2,])
## time_stamp block_number hash gas_price
## Length:195642 Min. :10966876 Min. :4.411e+71 Min. : 0.0
## Class :character 1st Qu.:11244466 1st Qu.:2.885e+76 1st Qu.: 40.7
## Mode :character Median :11490178 Median :5.799e+76 Median : 72.0
## Mean :11481334 Mean :5.789e+76 Mean : 97.5
## 3rd Qu.:11724876 3rd Qu.:8.688e+76 3rd Qu.: 125.0
## Max. :11948958 Max. :1.158e+77 Max. :3139.4
## receipt_gas_used totalfee past_min past_max
## Min. : 13013 Min. :0.000e+00 Min. : 0.00 Min. : 18.05
## 1st Qu.: 21000 1st Qu.:1.428e+06 1st Qu.: 31.06 1st Qu.: 73.64
## Median : 37900 Median :2.960e+06 Median : 51.77 Median : 115.65
## Mean : 67028 Mean :6.234e+06 Mean : 68.11 Mean : 150.29
## 3rd Qu.: 58384 3rd Qu.:6.402e+06 3rd Qu.: 87.50 3rd Qu.: 188.19
## Max. :7672865 Max. :1.104e+09 Max. :1330.40 Max. :14309.59
## past_median blck_med blck_min blck_max
## Min. : 13.00 Min. : 0.00 Min. : 0.00 Min. : 1.0
## 1st Qu.: 47.56 1st Qu.: 40.00 1st Qu.: 16.00 1st Qu.: 166.7
## Median : 74.08 Median : 68.00 Median : 42.00 Median : 259.5
## Mean : 98.55 Mean : 91.73 Mean : 60.54 Mean : 617.2
## 3rd Qu.: 126.12 3rd Qu.: 118.00 3rd Qu.: 84.00 3rd Qu.: 500.0
## Max. :1823.15 Max. :3888.00 Max. :1400.45 Max. :210000.0
## blck_totalfee blck_count Eur_night eth_high
## Min. :5.630e+06 Min. : 5.0 Min. :0.0000 Min. : 342.6
## 1st Qu.:5.930e+08 1st Qu.:163.0 1st Qu.:0.0000 1st Qu.: 465.7
## Median :9.647e+08 Median :196.0 Median :1.0000 Median : 637.1
## Mean :1.279e+09 Mean :202.3 Mean :0.5876 Mean : 932.2
## 3rd Qu.:1.622e+09 3rd Qu.:232.0 3rd Qu.:1.0000 3rd Qu.:1378.9
## Max. :9.682e+10 Max. :596.0 Max. :1.0000 Max. :2036.3
Looking at what fraction of the data has a gas price over 800
# Share of rows with a gas price above 800.
sum(sample$gas_price > 800) / nrow(sample)
## [1] 0.002202183
# Share of rows with a gas price above 600.
sum(sample$gas_price > 600) / nrow(sample)
## [1] 0.005683875
Remove all gas prices over 600 (and all gas prices equal to 0)
# Rows to discard: gas price above 600, or exactly 0.
over600 <- which(sample_scale2$target > 600 | sample_scale2$target == 0)
# Keep the complement by positive positions. Negative indexing with an empty
# `over600` would return zero rows instead of leaving the data unchanged.
keep_rows5 <- setdiff(seq_len(nrow(sample_scale2)), over600)
sample_scale5 <- sample_scale2[keep_rows5, ]
refit model
# Refit the full model on the data with high and zero gas prices removed.
model5 <- lm(target ~ ., data = sample_scale5)
summary(model5)
##
## Call:
## lm(formula = target ~ ., data = sample_scale5)
##
## Residuals:
## Min 1Q Median 3Q Max
## -5633.4 -31.1 -10.3 17.3 875.0
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 96.10637 0.22366 429.704 < 2e-16 ***
## receipt_gas_used -24.47749 0.19265 -127.059 < 2e-16 ***
## totalfee 122.33204 0.67673 180.770 < 2e-16 ***
## past_min 0.69243 0.35589 1.946 0.0517 .
## past_max -0.08713 0.21222 -0.411 0.6814
## past_median 8.60782 0.41193 20.896 < 2e-16 ***
## eth_high 31.83786 0.18768 169.640 < 2e-16 ***
## Eur_night1 1.57567 0.29169 5.402 6.6e-08 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 63.36 on 195011 degrees of freedom
## Multiple R-squared: 0.4048, Adjusted R-squared: 0.4048
## F-statistic: 1.895e+04 on 7 and 195011 DF, p-value: < 2.2e-16
Find best fit - backward
# Backward stepwise selection starting from model5; k = 2 is the AIC penalty.
step(model5, direction = "backward", k = 2)
## Start: AIC=1618208
## target ~ receipt_gas_used + totalfee + past_min + past_max +
## past_median + eth_high + Eur_night
##
## Df Sum of Sq RSS AIC
## - past_max 1 677 782862144 1618207
## <none> 782861468 1618208
## - past_min 1 15197 782876664 1618210
## - Eur_night 1 117141 782978608 1618236
## - past_median 1 1752926 784614393 1618643
## - receipt_gas_used 1 64808861 847670329 1633717
## - eth_high 1 115526317 898387785 1645050
## - totalfee 1 131183615 914045083 1648420
##
## Step: AIC=1618207
## target ~ receipt_gas_used + totalfee + past_min + past_median +
## eth_high + Eur_night
##
## Df Sum of Sq RSS AIC
## <none> 782862144 1618207
## - past_min 1 15598 782877742 1618208
## - Eur_night 1 117231 782979376 1618234
## - past_median 1 2150246 785012390 1618739
## - receipt_gas_used 1 64809554 847671698 1633716
## - eth_high 1 116514317 899376461 1645262
## - totalfee 1 131182990 914045134 1648418
##
## Call:
## lm(formula = target ~ receipt_gas_used + totalfee + past_min +
## past_median + eth_high + Eur_night, data = sample_scale5)
##
## Coefficients:
## (Intercept) receipt_gas_used totalfee past_min
## 96.1060 -24.4776 122.3316 0.7004
## past_median eth_high Eur_night1
## 8.5324 31.8448 1.5763
Without the large data, past_min matters.
Looking at forward model
# Forward stepwise selection on the trimmed data: start from the intercept-only
# model and allow terms up to those in model5, using the AIC penalty (k = 2).
basemodel2 <- lm(target ~ 1, data = sample_scale5)
step(
  basemodel2,
  scope = list(upper = model5, lower = ~1),
  direction = "forward",
  k = 2
)
## Start: AIC=1719387
## target ~ 1
##
## Df Sum of Sq RSS AIC
## + eth_high 1 387068043 928257660 1651419
## + past_median 1 229525185 1085800517 1681990
## + past_min 1 200722764 1114602938 1687096
## + totalfee 1 136617127 1178708575 1698002
## + past_max 1 104790262 1210535440 1703198
## + receipt_gas_used 1 1121792 1314203910 1719222
## + Eur_night 1 1047792 1314277910 1719233
## <none> 1315325702 1719387
##
## Step: AIC=1651419
## target ~ eth_high
##
## Df Sum of Sq RSS AIC
## + totalfee 1 68491243 859766416 1636473
## + past_median 1 13728466 914529194 1648515
## + past_min 1 10879594 917378065 1649121
## + past_max 1 6082521 922175138 1650138
## + receipt_gas_used 1 299067 927958593 1651358
## + Eur_night 1 287690 927969970 1651360
## <none> 928257660 1651419
##
## Step: AIC=1636473
## target ~ eth_high + totalfee
##
## Df Sum of Sq RSS AIC
## + receipt_gas_used 1 66774158 792992258 1620708
## + past_median 1 11900391 847866026 1633756
## + past_min 1 9388195 850378222 1634333
## + past_max 1 5173823 854592593 1635298
## + Eur_night 1 270018 859496399 1636413
## <none> 859766416 1636473
##
## Step: AIC=1620708
## target ~ eth_high + totalfee + receipt_gas_used
##
## Df Sum of Sq RSS AIC
## + past_median 1 9997250 782995008 1618236
## + past_min 1 7847310 785144948 1618770
## + past_max 1 4370946 788621312 1619632
## + Eur_night 1 191814 792800445 1620663
## <none> 792992258 1620708
##
## Step: AIC=1618236
## target ~ eth_high + totalfee + receipt_gas_used + past_median
##
## Df Sum of Sq RSS AIC
## + Eur_night 1 117267 782877742 1618208
## + past_min 1 15633 782979376 1618234
## <none> 782995008 1618236
## + past_max 1 1191 782993817 1618237
##
## Step: AIC=1618208
## target ~ eth_high + totalfee + receipt_gas_used + past_median +
## Eur_night
##
## Df Sum of Sq RSS AIC
## + past_min 1 15597.5 782862144 1618207
## <none> 782877742 1618208
## + past_max 1 1077.5 782876664 1618210
##
## Step: AIC=1618207
## target ~ eth_high + totalfee + receipt_gas_used + past_median +
## Eur_night + past_min
##
## Df Sum of Sq RSS AIC
## <none> 782862144 1618207
## + past_max 1 676.68 782861468 1618208
##
## Call:
## lm(formula = target ~ eth_high + totalfee + receipt_gas_used +
## past_median + Eur_night + past_min, data = sample_scale5)
##
## Coefficients:
## (Intercept) eth_high totalfee receipt_gas_used
## 96.1060 31.8448 122.3316 -24.4776
## past_median Eur_night1 past_min
## 8.5324 1.5763 0.7004
Both procedures select the same model - refit it (eth_high, totalfee, receipt_gas_used, past_median, Eur_night, past_min)
# Drop the fourth column of sample_scale5 by position and refit on the rest.
sample_scale6 <- sample_scale5[, -4]
model6 <- lm(target ~ ., data = sample_scale6)
summary(model6)
##
## Call:
## lm(formula = target ~ ., data = sample_scale6)
##
## Residuals:
## Min 1Q Median 3Q Max
## -5633.4 -31.1 -10.3 17.3 875.0
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 96.1060 0.2237 429.706 < 2e-16 ***
## receipt_gas_used -24.4776 0.1926 -127.060 < 2e-16 ***
## totalfee 122.3316 0.6767 180.770 < 2e-16 ***
## past_min 0.7004 0.3554 1.971 0.0487 *
## past_median 8.5324 0.3687 23.144 < 2e-16 ***
## eth_high 31.8448 0.1869 170.364 < 2e-16 ***
## Eur_night1 1.5763 0.2917 5.404 6.53e-08 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 63.36 on 195012 degrees of freedom
## Multiple R-squared: 0.4048, Adjusted R-squared: 0.4048
## F-statistic: 2.211e+04 on 6 and 195012 DF, p-value: < 2.2e-16
# Standard lm diagnostic plots for model6.
plot(model6)
# Box-Cox profile for the response of model6 (boxcox() is provided by MASS).
boxcox(model6)
DFBetas Gas Amount
# DFBETAS for the receipt_gas_used ("gas amount") coefficient of model6.
n2 <- nrow(sample_scale6)
# Select the column by coefficient name: column 1 of dfbetas() is the
# intercept, so a positional [, 1] would not describe gas amount.
# dfbetas() is expensive on this many rows, so it is evaluated once and reused.
dfb_gas <- dfbetas(model6)[, "receipt_gas_used"]
# Conventional size-adjusted cut-off for DFBETAS.
dfb_cutoff <- 2 / sqrt(n2)
plot(dfb_gas, xlab = "index", ylab = "dfbeta_1",
     pch = 16, main = "Gas amount",
     cex = 1.5, cex.main = 1.5, cex.lab = 1.5, cex.axis = 1.5)
abline(h = c(dfb_cutoff, -dfb_cutoff), col = "red", lwd = 3, lty = 2)
# Positions (within sample_scale6) of the observations beyond the cut-off.
b1 <- which(abs(dfb_gas) > dfb_cutoff)
length(b1)
## [1] 7713
# `b1` holds positions within sample_scale6, which has fewer rows than `sample`
# (high and zero gas prices were removed). Translate them through the row names,
# which both data frames share, so the summary covers the flagged observations
# rather than whatever rows happen to sit at those positions in `sample`.
summary(sample[match(rownames(sample_scale6)[b1], rownames(sample)), ])
## time_stamp block_number hash gas_price
## Length:7713 Min. :10966884 Min. :1.651e+73 Min. : 0.0
## Class :character 1st Qu.:11574703 1st Qu.:2.954e+76 1st Qu.: 74.1
## Mode :character Median :11747768 Median :5.794e+76 Median : 128.0
## Mean :11667058 Mean :5.797e+76 Mean : 160.6
## 3rd Qu.:11834203 3rd Qu.:8.676e+76 3rd Qu.: 193.0
## Max. :11941848 Max. :1.158e+77 Max. :21621.0
## receipt_gas_used totalfee past_min past_max
## Min. : 13013 Min. :0.000e+00 Min. : 1.195 Min. : 24.08
## 1st Qu.: 21000 1st Qu.:2.493e+06 1st Qu.: 49.828 1st Qu.: 110.35
## Median : 36170 Median :4.762e+06 Median : 83.578 Median : 172.20
## Mean : 65899 Mean :9.869e+06 Mean : 96.144 Mean : 200.87
## 3rd Qu.: 57369 3rd Qu.:1.015e+07 3rd Qu.:125.050 3rd Qu.: 245.97
## Max. :8655474 Max. :1.104e+09 Max. :763.853 Max. :5527.15
## past_median blck_med blck_min blck_max
## Min. : 17.60 Min. : 1.00 Min. : 0.00 Min. : 20.0
## 1st Qu.: 73.82 1st Qu.: 64.64 1st Qu.: 16.00 1st Qu.: 206.4
## Median : 124.36 Median : 115.00 Median : 69.00 Median : 324.4
## Mean : 140.29 Mean : 131.94 Mean : 85.48 Mean : 683.8
## 3rd Qu.: 179.29 3rd Qu.: 172.70 3rd Qu.:128.00 3rd Qu.: 550.0
## Max. :1249.32 Max. :2529.50 Max. :845.00 Max. :210000.0
## blck_totalfee blck_count Eur_night eth_high
## Min. :1.633e+07 Min. : 11.0 Min. :0.0000 Min. : 342.6
## 1st Qu.:9.254e+08 1st Qu.:167.0 1st Qu.:0.0000 1st Qu.: 754.3
## Median :1.551e+09 Median :202.0 Median :1.0000 Median :1382.7
## Mean :1.787e+09 Mean :210.5 Mean :0.5684 Mean :1322.6
## 3rd Qu.:2.283e+09 3rd Qu.:241.0 3rd Qu.:1.0000 3rd Qu.:1781.4
## Max. :3.353e+10 Max. :595.0 Max. :1.0000 Max. :2036.3
Closer look
# Zoomed view (y in [-0.1, 0.1]) of the DFBETAS for receipt_gas_used.
# The column is picked by coefficient name because column 1 of dfbetas() is
# the intercept.
plot(dfbetas(model6)[, "receipt_gas_used"], xlab = "index", ylab = "dfbeta_1",
     pch = 16, main = "Gas amount",
     cex = 1.5, cex.main = 1.5, cex.lab = 1.5, cex.axis = 1.5,
     ylim = c(-.1, .1))
abline(h = c(2 / sqrt(n2), -2 / sqrt(n2)), col = "red", lwd = 3, lty = 2)
DFBetas Total Fee
# DFBETAS for the totalfee coefficient of model6.
# Selected by coefficient name: with the intercept in column 1, positional
# column 2 is receipt_gas_used, not totalfee.
dfb_fee <- dfbetas(model6)[, "totalfee"]
plot(dfb_fee, xlab = "index", ylab = "dfbeta_2",
     pch = 16, main = "total fee",
     cex = 1.5, cex.main = 1.5, cex.lab = 1.5, cex.axis = 1.5)
abline(h = c(2 / sqrt(n2), -2 / sqrt(n2)), col = "red", lwd = 3, lty = 2)
# Positions (within sample_scale6) of the observations beyond the cut-off.
b2 <- which(abs(dfb_fee) > 2 / sqrt(n2))
length(b2)
## [1] 4165
# `b2` holds positions within sample_scale6, not within the larger `sample`.
# Map them through the shared row names so the summarised rows are the flagged
# observations themselves.
summary(sample[match(rownames(sample_scale6)[b2], rownames(sample)), ])
## time_stamp block_number hash gas_price
## Length:4165 Min. :10967706 Min. :1.651e+73 Min. : 1.0
## Class :character 1st Qu.:11332601 1st Qu.:3.053e+76 1st Qu.: 57.0
## Mode :character Median :11607413 Median :5.896e+76 Median : 111.0
## Mean :11557751 Mean :5.817e+76 Mean : 150.1
## 3rd Qu.:11807857 3rd Qu.:8.666e+76 3rd Qu.: 182.0
## Max. :11943217 Max. :1.157e+77 Max. :21621.0
## receipt_gas_used totalfee past_min past_max
## Min. : 13013 Min. : 21000 Min. : 1.195 Min. : 23.46
## 1st Qu.: 21000 1st Qu.: 2095760 1st Qu.: 40.703 1st Qu.: 92.53
## Median : 36712 Median : 4181077 Median : 68.760 Median : 153.20
## Mean : 63157 Mean : 8797086 Mean : 85.191 Mean : 185.88
## 3rd Qu.: 57369 3rd Qu.: 9171255 3rd Qu.:110.422 3rd Qu.: 224.43
## Max. :2716773 Max. :454040871 Max. :745.720 Max. :2792.06
## past_median blck_med blck_min blck_max
## Min. : 15.48 Min. : 1.0 Min. : 0.00 Min. : 18.7
## 1st Qu.: 61.48 1st Qu.: 52.0 1st Qu.: 16.10 1st Qu.: 200.0
## Median : 104.94 Median : 93.0 Median : 55.20 Median : 303.2
## Mean : 125.21 Mean : 115.8 Mean : 75.58 Mean : 637.3
## 3rd Qu.: 157.68 3rd Qu.: 150.0 3rd Qu.:110.00 3rd Qu.: 510.1
## Max. :1472.47 Max. :1918.2 Max. :845.00 Max. :32858.3
## blck_totalfee blck_count Eur_night eth_high
## Min. :1.725e+07 Min. : 20.0 Min. :0.000 Min. : 342.6
## 1st Qu.:7.556e+08 1st Qu.:167.0 1st Qu.:0.000 1st Qu.: 579.4
## Median :1.302e+09 Median :202.0 Median :1.000 Median :1209.4
## Mean :1.601e+09 Mean :208.6 Mean :0.606 Mean :1120.2
## 3rd Qu.:2.023e+09 3rd Qu.:238.0 3rd Qu.:1.000 3rd Qu.:1690.0
## Max. :3.143e+10 Max. :594.0 Max. :1.000 Max. :2036.3
Closer look
# Zoomed view of the DFBETAS values in column 2 of dfbetas(model6), with the
# y-axis clipped to [-1, 3]. The y-axis label now says "dfbeta_2" to match the
# column actually plotted (it previously read "dfbeta_1").
# The dashed red lines mark the +/- 2 / sqrt(n2) cutoff.
plot(dfbetas(model6)[, 2], xlab = "index", ylab = "dfbeta_2",
     pch = 16, main = "Total Fee",
     cex = 1.5, cex.main = 1.5, cex.lab = 1.5, cex.axis = 1.5,
     ylim = c(-1, 3))
abline(h = 2 / sqrt(n2), col = "red", lwd = 3, lty = 2)
abline(h = -2 / sqrt(n2), col = "red", lwd = 3, lty = 2)
DFBetas Past Min
# DFBETAS for coefficient column 3 of model6: plot every observation, flag the
# ones beyond the +/- 2 / sqrt(n2) cutoff, and summarise the flagged rows.
# The y-axis label now says "dfbeta_3" to match the column plotted (it
# previously read "dfbeta_2"), and dfbetas() is computed once and reused.
# NOTE(review): dfbetas() returns one column per coefficient, so if model6 has
# an intercept, column 3 is the second predictor -- confirm against
# colnames(dfbetas(model6)) that this is the past-min column.
dfbetas_col3 <- dfbetas(model6)[, 3]
dfbetas_cutoff <- 2 / sqrt(n2)
plot(dfbetas_col3, xlab = "index", ylab = "dfbeta_3",
     pch = 16, main = "Past Min",
     cex = 1.5, cex.main = 1.5, cex.lab = 1.5, cex.axis = 1.5)
abline(h = dfbetas_cutoff, col = "red", lwd = 3, lty = 2)
abline(h = -dfbetas_cutoff, col = "red", lwd = 3, lty = 2)
# Row positions (within the data model6 was fitted on) whose |DFBETAS| exceeds
# the cutoff.
b3 <- which(abs(dfbetas_col3) > dfbetas_cutoff)
# Number of flagged observations.
length(b3)
## [1] 5443
# NOTE(review): b3 holds positions within model6's data, but `sample` is
# indexed by position here; if rows were removed before fitting model6 the two
# do not line up -- confirm, or index by row name with sample[names(b3), ].
summary(sample[b3, ])
## time_stamp block_number hash gas_price
## Length:5443 Min. :10966888 Min. :1.651e+73 Min. : 1.00
## Class :character 1st Qu.:11403401 1st Qu.:3.032e+76 1st Qu.: 64.06
## Mode :character Median :11632202 Median :5.899e+76 Median : 125.00
## Mean :11594900 Mean :5.829e+76 Mean : 157.42
## 3rd Qu.:11827010 3rd Qu.:8.725e+76 3rd Qu.: 197.00
## Max. :11943217 Max. :1.158e+77 Max. :21620.99
## receipt_gas_used totalfee past_min past_max
## Min. : 13141 Min. : 21000 Min. : 1.194 Min. : 23.82
## 1st Qu.: 21000 1st Qu.: 2349268 1st Qu.: 44.244 1st Qu.: 101.15
## Median : 36824 Median : 4545450 Median : 77.542 Median : 166.41
## Mean : 62907 Mean : 9412905 Mean : 92.528 Mean : 196.57
## 3rd Qu.: 57369 3rd Qu.: 10068764 3rd Qu.:121.866 3rd Qu.: 240.12
## Max. :2716773 Max. :454040871 Max. :763.853 Max. :5527.15
## past_median blck_med blck_min blck_max
## Min. : 17.59 Min. : 1.00 Min. : 0.00 Min. : 18.7
## 1st Qu.: 66.88 1st Qu.: 56.64 1st Qu.: 17.10 1st Qu.: 207.9
## Median : 118.29 Median : 106.00 Median : 63.00 Median : 326.2
## Mean : 135.16 Mean : 125.95 Mean : 83.31 Mean : 669.4
## 3rd Qu.: 172.66 3rd Qu.: 166.30 3rd Qu.:123.75 3rd Qu.: 547.5
## Max. :1472.47 Max. :1918.24 Max. :845.00 Max. :32858.3
## blck_totalfee blck_count Eur_night eth_high
## Min. :1.633e+07 Min. : 25.0 Min. :0.000 Min. : 342.6
## 1st Qu.:8.352e+08 1st Qu.:168.0 1st Qu.:0.000 1st Qu.: 610.0
## Median :1.462e+09 Median :203.0 Median :1.000 Median :1282.6
## Mean :1.722e+09 Mean :210.0 Mean :0.605 Mean :1199.8
## 3rd Qu.:2.212e+09 3rd Qu.:239.5 3rd Qu.:1.000 3rd Qu.:1770.6
## Max. :3.143e+10 Max. :594.0 Max. :1.000 Max. :2036.3
# Zoomed view of the DFBETAS values in column 3 of dfbetas(model6): the y-axis
# is clipped to [-0.1, 0.1] so the points around the cutoff are readable.
# The dashed red lines mark the +/- 2 / sqrt(n2) cutoff.
plot(
  dfbetas(model6)[, 3],
  main = "Past Min", xlab = "index", ylab = "dfbeta_3",
  ylim = c(-0.1, 0.1), pch = 16,
  cex = 1.5, cex.main = 1.5, cex.lab = 1.5, cex.axis = 1.5
)
abline(h = c(1, -1) * 2 / sqrt(n2), col = "red", lwd = 3, lty = 2)
DFFits
# DFFITS for model6: plot every observation, flag the ones beyond the
# +/- 2 * sqrt((k3 + 1) / n2) cutoff, and summarise the flagged rows.
# dffits() and the cutoff are computed once and reused.
# NOTE(review): k3 is presumably the number of predictors in model6 -- confirm
# it matches the fitted model.
k3 <- 6
dffits_model6 <- dffits(model6)
dffits_cutoff <- 2 * sqrt((k3 + 1) / n2)
plot(dffits_model6, xlab = "index", ylab = "dffits",
     pch = 16, main = "DFFITS ",
     cex = 1.5, cex.main = 2, cex.lab = 1.5, cex.axis = 1.5)
abline(h = dffits_cutoff, col = "red", lwd = 3, lty = 2)
abline(h = -dffits_cutoff, col = "red", lwd = 3, lty = 2)
# Row positions (within the data model6 was fitted on) whose |DFFITS| exceeds
# the cutoff.
yinfluence <- which(abs(dffits_model6) > dffits_cutoff)
# Number of flagged observations.
length(yinfluence)
## [1] 9798
# NOTE(review): yinfluence holds positions within model6's data, but `sample`
# is indexed by position here; if rows were removed before fitting model6 the
# two do not line up -- confirm, or use sample[names(yinfluence), ].
summary(sample[yinfluence, ])
## time_stamp block_number hash gas_price
## Length:9798 Min. :10967706 Min. :1.651e+73 Min. : 1.0
## Class :character 1st Qu.:11585838 1st Qu.:2.844e+76 1st Qu.: 91.0
## Mode :character Median :11794648 Median :5.769e+76 Median : 146.0
## Mean :11687803 Mean :5.752e+76 Mean : 178.6
## 3rd Qu.:11872034 3rd Qu.:8.675e+76 3rd Qu.: 211.0
## Max. :11943217 Max. :1.158e+77 Max. :21621.0
## receipt_gas_used totalfee past_min past_max
## Min. : 13013 Min. :2.100e+04 Min. : 1.023 Min. : 23.82
## 1st Qu.: 21000 1st Qu.:2.884e+06 1st Qu.: 57.965 1st Qu.: 136.58
## Median : 36294 Median :5.158e+06 Median : 97.925 Median : 194.45
## Mean : 63778 Mean :1.090e+07 Mean :108.367 Mean : 225.12
## 3rd Qu.: 57381 3rd Qu.:1.156e+07 3rd Qu.:138.835 3rd Qu.: 265.64
## Max. :5460163 Max. :1.104e+09 Max. :895.098 Max. :5527.15
## past_median blck_med blck_min blck_max
## Min. : 17.85 Min. : 1.0 Min. : 0.00 Min. : 18.7
## 1st Qu.: 93.95 1st Qu.: 80.0 1st Qu.: 16.67 1st Qu.: 234.0
## Median : 144.40 Median : 133.2 Median : 90.00 Median : 360.0
## Mean : 159.83 Mean : 149.8 Mean : 97.83 Mean : 767.3
## 3rd Qu.: 196.65 3rd Qu.: 191.3 3rd Qu.: 145.20 3rd Qu.: 594.5
## Max. :1687.49 Max. :2529.5 Max. :1400.45 Max. :210000.0
## blck_totalfee blck_count Eur_night eth_high
## Min. :1.633e+07 Min. : 25.0 Min. :0.0000 Min. : 342.6
## 1st Qu.:1.159e+09 1st Qu.:171.0 1st Qu.:0.0000 1st Qu.:1134.3
## Median :1.792e+09 Median :206.0 Median :1.0000 Median :1689.2
## Mean :2.040e+09 Mean :214.2 Mean :0.6042 Mean :1416.1
## 3rd Qu.:2.530e+09 3rd Qu.:244.0 3rd Qu.:1.0000 3rd Qu.:1833.8
## Max. :3.353e+10 Max. :594.0 Max. :1.0000 Max. :2036.3
Removing observations whose target gas price is greater than 350 or equal to 0
# Drop observations whose target is above 350 or exactly 0, then refit the
# regression of target on all remaining columns of sample_scale2.
outliers3 <- which(sample_scale2$target > 350 | sample_scale2$target == 0)
# Select the rows to keep by positive index rather than `-outliers3`: when
# nothing matches, `-integer(0)` would select zero rows and silently empty the
# data frame.
keep_rows <- setdiff(seq_len(nrow(sample_scale2)), outliers3)
sample_scale7 <- sample_scale2[keep_rows, ]
model7 <- lm(target ~ ., data = sample_scale7)
summary(model7)
##
## Call:
## lm(formula = target ~ ., data = sample_scale7)
##
## Residuals:
## Min 1Q Median 3Q Max
## -4259.6 -28.0 -8.0 20.1 633.2
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 90.7715 0.1799 504.579 < 2e-16 ***
## receipt_gas_used -18.1512 0.1607 -112.961 < 2e-16 ***
## totalfee 92.8845 0.6114 151.927 < 2e-16 ***
## past_min 1.7774 0.2909 6.110 9.99e-10 ***
## past_max -0.3080 0.1706 -1.806 0.071 .
## past_median 6.6169 0.3357 19.713 < 2e-16 ***
## eth_high 30.1773 0.1520 198.555 < 2e-16 ***
## Eur_night1 1.5414 0.2343 6.578 4.77e-11 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 50.45 on 191577 degrees of freedom
## Multiple R-squared: 0.4322, Adjusted R-squared: 0.4322
## F-statistic: 2.083e+04 on 7 and 191577 DF, p-value: < 2.2e-16
# Draw the diagnostic plots for the fitted model7.
plot(model7)
# Backward stepwise selection starting from model7. k = 2 is the penalty per
# parameter, i.e. the selection criterion is AIC (as shown in the output).
step(model7, direction = "backward", k = 2)
## Start: AIC=1502422
## target ~ receipt_gas_used + totalfee + past_min + past_max +
## past_median + eth_high + Eur_night
##
## Df Sum of Sq RSS AIC
## <none> 487631610 1502422
## - past_max 1 8299 487639909 1502424
## - past_min 1 95021 487726631 1502458
## - Eur_night 1 110147 487741757 1502464
## - past_median 1 989162 488620772 1502809
## - receipt_gas_used 1 32479075 520110684 1514774
## - totalfee 1 58751470 546383079 1524215
## - eth_high 1 100347856 587979465 1538272
##
## Call:
## lm(formula = target ~ receipt_gas_used + totalfee + past_min +
## past_max + past_median + eth_high + Eur_night, data = sample_scale7)
##
## Coefficients:
## (Intercept) receipt_gas_used totalfee past_min
## 90.771 -18.151 92.885 1.777
## past_max past_median eth_high Eur_night1
## -0.308 6.617 30.177 1.541