Getting rid of the index and unnamed columns imported from Python

# Read the raw sample, drop column 1 and then column 17 of what remains
# (the index and unnamed columns carried over from the Python export),
# and finally discard every row that contains a missing value.
sample <- read.csv("Sample_Data.csv")
sample <- na.omit(sample[, -1][, -17])

Scaling the data (time, block number, hash, and the categorical Eur_night should not be used in scaling, nor should the target gas price)

# Standardise the numeric predictors. Columns 1-4 and 15 are held out of
# scaling; in the summaries printed later these are time_stamp,
# block_number, hash, gas_price and Eur_night.
drop_columns <- c(1, 2, 3, 4, 15)
sample_scale <- scale(sample[, -drop_columns])

Adding Eur_night back in as a categorical variable, and adding back in the target gas price

# scale() hands back a matrix: convert it to a data frame, then re-attach
# the two columns that were kept out of scaling (Eur_night as a factor and
# the unscaled gas price as the response).
sample_scale <- as.data.frame(sample_scale)
sample_scale[["Eur_night"]] <- as.factor(sample[["Eur_night"]])
sample_scale[["target"]] <- sample[["gas_price"]]

Full regression analysis on gas price as response

# Full model: gas price regressed on every column of sample_scale.
model_data <- lm(formula = target ~ ., data = sample_scale)
summary(model_data)
## 
## Call:
## lm(formula = target ~ ., data = sample_scale)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -12498    -37    -15     13  32414 
## 
## Coefficients:
##                  Estimate Std. Error t value Pr(>|t|)    
## (Intercept)      101.8360     0.6454 157.796  < 2e-16 ***
## receipt_gas_used -24.9412     0.4227 -59.001  < 2e-16 ***
## totalfee         118.9964     0.4231 281.271  < 2e-16 ***
## past_min           0.2896     1.0270   0.282 0.777938    
## past_max          -0.2992     0.6142  -0.487 0.626117    
## past_median        7.6473     1.4800   5.167 2.38e-07 ***
## blck_med           0.1117     1.2334   0.091 0.927837    
## blck_min          -0.3582     0.6235  -0.574 0.565668    
## blck_max           1.7639     0.5288   3.336 0.000851 ***
## blck_totalfee      2.5774     1.1555   2.231 0.025710 *  
## blck_count         0.9412     0.4275   2.202 0.027695 *  
## eth_high          33.8273     0.5362  63.083  < 2e-16 ***
## Eur_night1         1.4770     0.8428   1.752 0.079699 .  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 183.2 on 196156 degrees of freedom
## Multiple R-squared:  0.3208, Adjusted R-squared:  0.3208 
## F-statistic:  7722 on 12 and 196156 DF,  p-value: < 2.2e-16

Getting rid of unnecessary columns (rows with NA values were already removed when the data was loaded)

# Reduced model: leave out columns 6-10 of sample_scale (the five blck_*
# predictors in the coefficient table above) and refit.
drop_columns2 <- 6:10
sample_scale2 <- sample_scale[, -drop_columns2]
model2 <- lm(formula = target ~ ., data = sample_scale2)
summary(model2)
## 
## Call:
## lm(formula = target ~ ., data = sample_scale2)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -12508    -37    -15     13  32413 
## 
## Coefficients:
##                   Estimate Std. Error t value Pr(>|t|)    
## (Intercept)      101.76256    0.64467 157.853   <2e-16 ***
## receipt_gas_used -24.97888    0.42262 -59.104   <2e-16 ***
## totalfee         119.02510    0.42308 281.329   <2e-16 ***
## past_min           0.27580    1.02306   0.270   0.7875    
## past_max          -0.09118    0.61170  -0.149   0.8815    
## past_median        9.82554    1.18567   8.287   <2e-16 ***
## eth_high          33.84553    0.53182  63.641   <2e-16 ***
## Eur_night1         1.60197    0.84123   1.904   0.0569 .  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 183.3 on 196161 degrees of freedom
## Multiple R-squared:  0.3207, Adjusted R-squared:  0.3206 
## F-statistic: 1.323e+04 on 7 and 196161 DF,  p-value: < 2.2e-16
# Normal Q-Q plot of the studentized residuals, with the y = x line drawn
# in red for reference.
qqnorm(y = rstudent(model2))
abline(a = 0, b = 1, col = "red")

# Studentized residuals against fitted values (y-axis limited to [-5, 5]),
# with horizontal guides at +3 and -3. The plot() arguments are left as
# written because the default axis labels are taken from them.
plot(model2$fitted.values, rstudent(model2), ylim = c(-5, 5))
abline(h = 3, col = "red")
abline(h = -3, col = "red")

looking at leverage

# Leverage diagnostics for model2.
# k is the number of predictor terms (coefficients other than the
# intercept) and n the number of observations. Both are read off the fit
# and its data instead of being typed in, so the 2 * (k + 1) / n cutoff
# stays right if the model changes.
k <- length(coef(model2)) - 1
n <- nrow(sample_scale2)
leverage_cutoff <- 2 * (k + 1) / n

# hatvalues() and rstudent() are computed once and reused for both the plot
# and the selection below.
hat_model2 <- hatvalues(model2)
rstud_model2 <- rstudent(model2)

plot(hat_model2, rstud_model2,
     xlab = "hat values", ylab = "studentized residuals",
     pch = 16, main = "Residual vs leverage",
     cex = 1.5, cex.main = 1.5, cex.lab = 1.5, cex.axis = 1.5)
abline(v = leverage_cutoff, col = "red", lwd = 3, lty = 2)
abline(h = c(-3, 3), col = "red", lwd = 3, lty = 2)

# Rows that are both high-leverage and have a studentized residual beyond
# +/- 3.
leverage <- which(hat_model2 > leverage_cutoff & abs(rstud_model2) > 3)
gas_levearge <- sample_scale2[leverage, ]
gas_levearge
summary(gas_levearge$target)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    12.0   111.8  1298.9  4058.1  5075.1 34912.0

Just looking at outliers: observations whose studentized residuals are above 3 or below -3

# Row positions whose studentized residual exceeds 3 in absolute value,
# and the matching rows of the scaled data.
outliers <- which(abs(rstudent(model2)) > 3)
gas_outliers <- sample_scale2[outliers, ]
summary(gas_outliers)
##  receipt_gas_used      totalfee           past_min           past_max        
##  Min.   :-0.37196   Min.   :  0.0375   Min.   :-1.12499   Min.   :-0.879027  
##  1st Qu.:-0.31856   1st Qu.:  0.3082   1st Qu.:-0.51187   1st Qu.:-0.425772  
##  Median :-0.26557   Median :  0.6669   Median :-0.07212   Median :-0.007797  
##  Mean   : 2.52735   Mean   :  3.1128   Mean   : 0.21563   Mean   : 0.173870  
##  3rd Qu.: 0.04586   3rd Qu.:  1.8969   3rd Qu.: 0.66478   3rd Qu.: 0.451188  
##  Max.   :82.08100   Max.   :398.5944   Max.   :12.49627   Max.   :10.894746  
##   past_median          eth_high       Eur_night     target       
##  Min.   :-0.97367   Min.   :-1.1048   0:209     Min.   :   12.0  
##  1st Qu.:-0.50020   1st Qu.:-0.6685   1:318     1st Qu.:  806.2  
##  Median : 0.03345   Median : 0.3776             Median : 1039.4  
##  Mean   : 0.24611   Mean   : 0.3388             Mean   : 2034.7  
##  3rd Qu.: 0.73053   3rd Qu.: 1.4572             3rd Qu.: 1744.4  
##  Max.   :12.42837   Max.   : 2.0662             Max.   :34912.0
# Number of flagged observations.
nrow(gas_outliers)
## [1] 527
# Distribution of the gas price among the flagged observations.
hist(gas_outliers$target)

# Show the flagged rows themselves.
print(gas_outliers)

Looking at these with original data

# Same flagged rows, taken from the unscaled data.
summary(object = sample[outliers, ])
##   time_stamp         block_number           hash             gas_price      
##  Length:527         Min.   :10974925   Min.   :5.786e+73   Min.   :   12.0  
##  Class :character   1st Qu.:11338118   1st Qu.:2.789e+76   1st Qu.:  806.2  
##  Mode  :character   Median :11602472   Median :5.801e+76   Median : 1039.4  
##                     Mean   :11569641   Mean   :5.884e+76   Mean   : 2034.7  
##                     3rd Qu.:11849119   3rd Qu.:8.951e+76   3rd Qu.: 1744.4  
##                     Max.   :11947143   Max.   :1.157e+77   Max.   :34912.0  
##  receipt_gas_used      totalfee            past_min          past_max      
##  Min.   :   13116   Min.   :8.538e+06   Min.   :  3.299   Min.   :  27.44  
##  1st Qu.:   21000   1st Qu.:2.212e+07   1st Qu.: 38.638   1st Qu.:  90.82  
##  Median :   28823   Median :4.011e+07   Median : 63.985   Median : 149.26  
##  Mean   :  441183   Mean   :1.628e+08   Mean   : 80.570   Mean   : 174.66  
##  3rd Qu.:   74804   3rd Qu.:1.018e+08   3rd Qu.:106.459   3rd Qu.: 213.44  
##  Max.   :12186847   Max.   :2.000e+10   Max.   :788.409   Max.   :1673.71  
##   past_median         blck_med         blck_min         blck_max      
##  Min.   :  20.17   Min.   :   1.0   Min.   :  0.00   Min.   :   31.0  
##  1st Qu.:  58.31   1st Qu.:  50.0   1st Qu.: 15.05   1st Qu.:  212.6  
##  Median : 101.30   Median :  92.3   Median : 52.80   Median :  355.0  
##  Mean   : 118.43   Mean   : 112.9   Mean   : 72.99   Mean   :  949.5  
##  3rd Qu.: 157.45   3rd Qu.: 150.5   3rd Qu.:115.00   3rd Qu.:  700.0  
##  Max.   :1099.74   Max.   :1437.5   Max.   :700.00   Max.   :30000.0  
##  blck_totalfee         blck_count      Eur_night         eth_high     
##  Min.   :5.935e+07   Min.   :  8.0   Min.   :0.0000   Min.   : 342.6  
##  1st Qu.:7.732e+08   1st Qu.:166.0   1st Qu.:0.0000   1st Qu.: 575.6  
##  Median :1.337e+09   Median :204.0   Median :1.0000   Median :1134.3  
##  Mean   :1.648e+09   Mean   :209.4   Mean   :0.6034   Mean   :1113.6  
##  3rd Qu.:2.046e+09   3rd Qu.:241.5   3rd Qu.:1.0000   3rd Qu.:1711.0  
##  Max.   :3.353e+10   Max.   :582.0   Max.   :1.0000   Max.   :2036.3
# Whole unscaled data set, for comparison with the outlier summary.
summary(object = sample)
##   time_stamp         block_number           hash             gas_price       
##  Length:196169      Min.   :10966876   Min.   :4.411e+71   Min.   :    0.00  
##  Class :character   1st Qu.:11244657   1st Qu.:2.885e+76   1st Qu.:   40.96  
##  Mode  :character   Median :11490543   Median :5.799e+76   Median :   72.00  
##                     Mean   :11481572   Mean   :5.789e+76   Mean   :  102.70  
##                     3rd Qu.:11725084   3rd Qu.:8.688e+76   3rd Qu.:  126.00  
##                     Max.   :11948958   Max.   :1.158e+77   Max.   :34912.03  
##  receipt_gas_used      totalfee            past_min          past_max       
##  Min.   :   13013   Min.   :0.000e+00   Min.   :   0.00   Min.   :   18.05  
##  1st Qu.:   21000   1st Qu.:1.428e+06   1st Qu.:  31.07   1st Qu.:   73.66  
##  Median :   37793   Median :2.966e+06   Median :  51.80   Median :  115.72  
##  Mean   :   68033   Mean   :6.654e+06   Mean   :  68.14   Mean   :  150.35  
##  3rd Qu.:   58655   3rd Qu.:6.453e+06   3rd Qu.:  87.54   3rd Qu.:  188.32  
##  Max.   :12186847   Max.   :2.000e+10   Max.   :1330.40   Max.   :14309.59  
##   past_median         blck_med          blck_min          blck_max       
##  Min.   :  13.00   Min.   :   0.00   Min.   :   0.00   Min.   :     1.0  
##  1st Qu.:  47.58   1st Qu.:  40.00   1st Qu.:  16.00   1st Qu.:   166.7  
##  Median :  74.13   Median :  68.00   Median :  42.14   Median :   260.0  
##  Mean   :  98.60   Mean   :  91.78   Mean   :  60.58   Mean   :   618.1  
##  3rd Qu.: 126.22   3rd Qu.: 118.00   3rd Qu.:  84.00   3rd Qu.:   500.0  
##  Max.   :1823.15   Max.   :3888.00   Max.   :1400.45   Max.   :210000.0  
##  blck_totalfee         blck_count      Eur_night         eth_high     
##  Min.   :5.630e+06   Min.   :  5.0   Min.   :0.0000   Min.   : 342.6  
##  1st Qu.:5.934e+08   1st Qu.:163.0   1st Qu.:0.0000   1st Qu.: 465.7  
##  Median :9.653e+08   Median :196.0   Median :1.0000   Median : 637.1  
##  Mean   :1.280e+09   Mean   :202.3   Mean   :0.5877   Mean   : 932.7  
##  3rd Qu.:1.624e+09   3rd Qu.:232.0   3rd Qu.:1.0000   3rd Qu.:1378.9  
##  Max.   :9.682e+10   Max.   :596.0   Max.   :1.0000   Max.   :2036.3

looking at prices below 800

# Outliers whose gas price is below 800.
# A which() over the subset sample_scale2[outliers, ] numbers rows
# 1..length(outliers) inside that subset, so using its result to index the
# full data frame picks the first rows of the data instead of the outliers.
# The comparison is therefore mapped back through `outliers` to get row
# positions in the full data.
# NOTE: the summary printed after this chunk predates this correction
# (eth_high is identical in every row it shows) and should be regenerated.
below800 <- outliers[sample_scale2$target[outliers] < 800]
summary(sample_scale2[below800, ])
##  receipt_gas_used      totalfee           past_min          past_max      
##  Min.   :-0.32921   Min.   :-0.13221   Min.   :-0.9632   Min.   :-0.5498  
##  1st Qu.:-0.31856   1st Qu.:-0.09957   1st Qu.:-0.2879   1st Qu.:-0.3810  
##  Median :-0.20660   Median :-0.07216   Median :-0.1097   Median :-0.2349  
##  Mean   :-0.06736   Mean   :-0.04129   Mean   :-0.1285   Mean   : 0.1375  
##  3rd Qu.:-0.07818   3rd Qu.:-0.02342   3rd Qu.: 0.0487   3rd Qu.: 0.1378  
##  Max.   : 4.06131   Max.   : 0.68181   Max.   : 0.3290   Max.   : 6.1526  
##   past_median         eth_high      Eur_night     target      
##  Min.   :-0.5471   Min.   :-1.055   0:99      Min.   :  1.00  
##  1st Qu.:-0.2995   1st Qu.:-1.055   1:25      1st Qu.: 66.00  
##  Median :-0.2173   Median :-1.055             Median : 77.00  
##  Mean   :-0.2075   Mean   :-1.055             Mean   : 79.28  
##  3rd Qu.:-0.1319   3rd Qu.:-1.055             3rd Qu.: 84.50  
##  Max.   : 0.3336   Max.   :-1.055             Max.   :400.00

Regular data

# Unscaled rows of the outliers priced below 800. The row positions are
# taken from `outliers` directly, so they refer to rows of the full data
# set rather than to positions inside the outlier subset.
sample[outliers[sample$gas_price[outliers] < 800], ]

Looking at prices above 800

# Outliers whose gas price is above 800.
# The comparison is mapped back through `outliers` so that above800 holds
# row positions in the full data; a which() over the outlier subset would
# give positions inside that subset and select unrelated rows.
# NOTE: the summary printed after this chunk predates this correction (its
# maximum target is 390, which cannot be "above 800") and should be
# regenerated.
above800 <- outliers[sample_scale2$target[outliers] > 800]
summary(sample_scale2[above800, ])
##  receipt_gas_used      totalfee            past_min           past_max       
##  Min.   :-0.31958   Min.   :-0.132214   Min.   :-1.04704   Min.   :-0.59702  
##  1st Qu.:-0.31856   1st Qu.:-0.098729   1st Qu.:-0.26867   1st Qu.:-0.40471  
##  Median :-0.18176   Median :-0.069662   Median :-0.12614   Median :-0.26463  
##  Mean   : 0.05829   Mean   :-0.007382   Mean   :-0.13227   Mean   : 0.04212  
##  3rd Qu.: 0.08070   3rd Qu.:-0.000528   3rd Qu.: 0.05198   3rd Qu.: 0.16473  
##  Max.   :17.01453   Max.   : 4.238675   Max.   : 0.50680   Max.   : 6.15262  
##   past_median         eth_high      Eur_night     target      
##  Min.   :-0.5546   Min.   :-1.055   0:230     Min.   :  1.00  
##  1st Qu.:-0.3032   1st Qu.:-1.055   1:167     1st Qu.: 67.00  
##  Median :-0.2076   Median :-1.055             Median : 77.00  
##  Mean   :-0.1918   Mean   :-1.055             Mean   : 80.73  
##  3rd Qu.:-0.1148   3rd Qu.:-1.055             3rd Qu.: 86.90  
##  Max.   : 0.5170   Max.   :-1.055             Max.   :390.00

Refitting with everything above 800 gone, and everything that is 0 gwei

# Refit without gas prices above 800 and without zero-priced rows.
# The rows to keep are listed explicitly: sample_scale2[-outliers2, ]
# would return zero rows if outliers2 were empty, because negative
# indexing with an empty vector selects nothing.
outliers2 <- which(sample_scale2$target > 800 | sample_scale2$target == 0)
keep_rows <- setdiff(seq_len(nrow(sample_scale2)), outliers2)
sample_scale3 <- sample_scale2[keep_rows, ]
model3 <- lm(target ~ ., data = sample_scale3)
summary(model3)
## 
## Call:
## lm(formula = target ~ ., data = sample_scale3)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -6364.2   -32.0   -10.9    16.1   993.0 
## 
## Coefficients:
##                  Estimate Std. Error  t value Pr(>|t|)    
## (Intercept)       97.8664     0.2426  403.404  < 2e-16 ***
## receipt_gas_used -27.6398     0.2059 -134.232  < 2e-16 ***
## totalfee         137.8079     0.7032  195.978  < 2e-16 ***
## past_min           0.7736     0.3851    2.009   0.0446 *  
## past_max          -0.2865     0.2305   -1.243   0.2138    
## past_median        8.9014     0.4462   19.950  < 2e-16 ***
## eth_high          32.0375     0.2033  157.604  < 2e-16 ***
## Eur_night1         1.4574     0.3165    4.605 4.13e-06 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 68.86 on 195694 degrees of freedom
## Multiple R-squared:  0.398,  Adjusted R-squared:  0.398 
## F-statistic: 1.848e+04 on 7 and 195694 DF,  p-value: < 2.2e-16
# Standard lm diagnostic plots for the refitted model.
plot(x = model3)

Refit using step(), starting from the model fitted with the outliers still included

library(MASS)
# Backward elimination from model2 with the AIC penalty (k = 2).
step(object = model2, direction = "backward", k = 2)
## Start:  AIC=2044436
## target ~ receipt_gas_used + totalfee + past_min + past_max + 
##     past_median + eth_high + Eur_night
## 
##                    Df  Sum of Sq        RSS     AIC
## - past_max          1        746 6587641090 2044434
## - past_min          1       2441 6587642785 2044434
## <none>                           6587640344 2044436
## - Eur_night         1     121787 6587762131 2044438
## - past_median       1    2306217 6589946561 2044503
## - receipt_gas_used  1  117315057 6704955401 2047897
## - eth_high          1  136015812 6723656156 2048443
## - totalfee          1 2657937495 9245577839 2110926
## 
## Step:  AIC=2044434
## target ~ receipt_gas_used + totalfee + past_min + past_median + 
##     eth_high + Eur_night
## 
##                    Df  Sum of Sq        RSS     AIC
## - past_min          1       2605 6587643695 2044432
## <none>                           6587641090 2044434
## - Eur_night         1     121889 6587762979 2044436
## - past_median       1    2840489 6590481579 2044517
## - receipt_gas_used  1  117317916 6704959006 2047895
## - eth_high          1  137236599 6724877689 2048477
## - totalfee          1 2657943379 9245584469 2110924
## 
## Step:  AIC=2044432
## target ~ receipt_gas_used + totalfee + past_median + eth_high + 
##     Eur_night
## 
##                    Df  Sum of Sq        RSS     AIC
## <none>                           6587643695 2044432
## - Eur_night         1     121899 6587765594 2044434
## - past_median       1   12013659 6599657354 2044788
## - receipt_gas_used  1  117318326 6704962021 2047893
## - eth_high          1  137587967 6725231662 2048485
## - totalfee          1 2657942694 9245586389 2110922
## 
## Call:
## lm(formula = target ~ receipt_gas_used + totalfee + past_median + 
##     eth_high + Eur_night, data = sample_scale2)
## 
## Coefficients:
##      (Intercept)  receipt_gas_used          totalfee       past_median  
##          101.762           -24.979           119.025            10.002  
##         eth_high        Eur_night1  
##           33.860             1.603
# Forward selection: start from the intercept-only model and allow terms
# up to those in model2, again with the AIC penalty (k = 2).
basemodel <- lm(formula = target ~ 1, data = sample_scale2)
step(
  object = basemodel,
  scope = list(lower = ~1, upper = model2),
  direction = "forward",
  k = 2
)
## Start:  AIC=2120269
## target ~ 1
## 
##                    Df  Sum of Sq        RSS     AIC
## + totalfee          1 2650171422 7046992553 2057647
## + eth_high          1  434062350 9263101625 2111287
## + past_median       1  262159900 9435004075 2114894
## + past_min          1  227312505 9469851471 2115617
## + past_max          1  121624046 9575539930 2117795
## + Eur_night         1    1480981 9695682994 2120241
## + receipt_gas_used  1     998076 9696165899 2120250
## <none>                           9697163975 2120269
## 
## Step:  AIC=2057647
## target ~ totalfee
## 
##                    Df Sum of Sq        RSS     AIC
## + eth_high          1 329141901 6717850652 2048266
## + past_median       1 197889271 6849103282 2052062
## + past_min          1 171743340 6875249213 2052809
## + receipt_gas_used  1 133084294 6913908259 2053909
## + past_max          1  89969894 6957022659 2055128
## + Eur_night         1    952756 7046039797 2057623
## <none>                          7046992553 2057647
## 
## Step:  AIC=2048266
## target ~ totalfee + eth_high
## 
##                    Df Sum of Sq        RSS     AIC
## + receipt_gas_used  1 117985277 6599865375 2044792
## + past_median       1  12712356 6705138296 2047896
## + past_min          1   9719683 6708130969 2047984
## + past_max          1   5499302 6712351350 2048107
## + Eur_night         1    280514 6717570138 2048260
## <none>                          6717850652 2048266
## 
## Step:  AIC=2044792
## target ~ totalfee + eth_high + receipt_gas_used
## 
##               Df Sum of Sq        RSS     AIC
## + past_median  1  12099781 6587765594 2044434
## + past_min     1   9243432 6590621943 2044519
## + past_max     1   5342310 6594523065 2044635
## + Eur_night    1    208021 6599657354 2044788
## <none>                     6599865375 2044792
## 
## Step:  AIC=2044434
## target ~ totalfee + eth_high + receipt_gas_used + past_median
## 
##             Df Sum of Sq        RSS     AIC
## + Eur_night  1    121899 6587643695 2044432
## <none>                   6587765594 2044434
## + past_min   1      2615 6587762979 2044436
## + past_max   1      1023 6587764571 2044436
## 
## Step:  AIC=2044432
## target ~ totalfee + eth_high + receipt_gas_used + past_median + 
##     Eur_night
## 
##            Df Sum of Sq        RSS     AIC
## <none>                  6587643695 2044432
## + past_min  1   2604.58 6587641090 2044434
## + past_max  1    909.99 6587642785 2044434
## 
## Call:
## lm(formula = target ~ totalfee + eth_high + receipt_gas_used + 
##     past_median + Eur_night, data = sample_scale2)
## 
## Coefficients:
##      (Intercept)          totalfee          eth_high  receipt_gas_used  
##          101.762           119.025            33.860           -24.979  
##      past_median        Eur_night1  
##           10.002             1.603

Backward and forward selection arrive at the same model (totalfee, eth_high, receipt_gas_used, past_median, Eur_night)

# Fit the model chosen by step(): columns 3 and 4 of sample_scale2
# (past_min and past_max in the tables above) are left out.
drop_columns3 <- c(3, 4)
sample_scale4 <- sample_scale2[, -drop_columns3]
model4 <- lm(formula = target ~ ., data = sample_scale4)
summary(model4)
## 
## Call:
## lm(formula = target ~ ., data = sample_scale4)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -12508    -37    -15     13  32413 
## 
## Coefficients:
##                  Estimate Std. Error t value Pr(>|t|)    
## (Intercept)      101.7621     0.6447 157.854   <2e-16 ***
## receipt_gas_used -24.9791     0.4226 -59.105   <2e-16 ***
## totalfee         119.0250     0.4231 281.330   <2e-16 ***
## past_median       10.0020     0.5288  18.914   <2e-16 ***
## eth_high          33.8597     0.5290  64.008   <2e-16 ***
## Eur_night1         1.6027     0.8412   1.905   0.0568 .  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 183.3 on 196163 degrees of freedom
## Multiple R-squared:  0.3207, Adjusted R-squared:  0.3206 
## F-statistic: 1.852e+04 on 5 and 196163 DF,  p-value: < 2.2e-16
# Normal Q-Q plot of model4's studentized residuals with the y = x line.
qqnorm(y = rstudent(model4))
abline(a = 0, b = 1, col = "red")

# Studentized residuals against fitted values (y-axis limited to [-5, 5]),
# with horizontal guides at +3 and -3. The plot() arguments are left as
# written because the default axis labels are taken from them.
plot(model4$fitted.values, rstudent(model4), ylim = c(-5, 5))
abline(h = 3, col = "red")
abline(h = -3, col = "red")

# Leverage plot for model4.
# k2 is the number of predictor terms (coefficients other than the
# intercept) and n the number of observations. Both are read off the fit
# and its data instead of being typed in, so the 2 * (k2 + 1) / n cutoff
# stays right if the model changes.
k2 <- length(coef(model4)) - 1
n <- nrow(sample_scale4)
plot(hatvalues(model4), rstudent(model4),
     xlab = "hat values", ylab = "studentized residuals",
     pch = 16, main = "Residual vs leverage",
     cex = 1.5, cex.main = 1.5, cex.lab = 1.5, cex.axis = 1.5)
abline(v = 2 * (k2 + 1) / n, col = "red", lwd = 3, lty = 2)
abline(h = c(-3, 3), col = "red", lwd = 3, lty = 2)

Looking at leverage

# Rows of model4 that exceed the leverage cutoff and also have a
# studentized residual beyond +/- 3.
leverage2 <- which(
  hatvalues(model4) > 2 * (k2 + 1) / n & abs(rstudent(model4)) > 3
)
gas_levearge2 <- sample_scale4[leverage2, ]
print(gas_levearge2)

non-scaled values

# The same high-leverage rows in the unscaled data, printed and summarised.
print(sample[leverage2, ])
summary(object = sample[leverage2, ])
##   time_stamp         block_number           hash             gas_price      
##  Length:117         Min.   :10974925   Min.   :6.082e+73   Min.   :   12.0  
##  Class :character   1st Qu.:11253924   1st Qu.:2.550e+76   1st Qu.:  113.5  
##  Mode  :character   Median :11531989   Median :5.595e+76   Median : 1945.8  
##                     Mean   :11514722   Mean   :5.818e+76   Mean   : 4220.0  
##                     3rd Qu.:11798673   3rd Qu.:9.197e+76   3rd Qu.: 5555.0  
##                     Max.   :11947143   Max.   :1.157e+77   Max.   :34912.0  
##  receipt_gas_used      totalfee            past_min         past_max      
##  Min.   :   13247   Min.   :1.854e+07   Min.   : 11.65   Min.   :  27.44  
##  1st Qu.:   62494   1st Qu.:9.289e+07   1st Qu.: 30.29   1st Qu.:  79.31  
##  Median :  126331   Median :1.950e+08   Median : 61.92   Median : 145.83  
##  Mean   : 1859190   Mean   :5.722e+08   Mean   : 97.02   Mean   : 212.09  
##  3rd Qu.: 5110271   3rd Qu.:4.689e+08   3rd Qu.:127.45   3rd Qu.: 224.32  
##  Max.   :12186847   Max.   :2.000e+10   Max.   :788.41   Max.   :1403.99  
##   past_median         blck_med         blck_min         blck_max      
##  Min.   :  20.91   Min.   :   9.0   Min.   :  0.00   Min.   :   31.0  
##  1st Qu.:  48.79   1st Qu.:  42.0   1st Qu.: 15.00   1st Qu.:  200.0  
##  Median :  89.44   Median :  82.5   Median : 41.00   Median :  325.2  
##  Mean   : 141.69   Mean   : 137.4   Mean   : 79.17   Mean   : 1375.1  
##  3rd Qu.: 174.56   3rd Qu.: 159.3   3rd Qu.:116.00   3rd Qu.:  700.0  
##  Max.   :1099.74   Max.   :1437.5   Max.   :700.00   Max.   :30000.0  
##  blck_totalfee         blck_count    Eur_night         eth_high     
##  Min.   :1.940e+08   Min.   :  8   Min.   :0.0000   Min.   : 351.6  
##  1st Qu.:6.205e+08   1st Qu.:146   1st Qu.:0.0000   1st Qu.: 467.7  
##  Median :1.190e+09   Median :200   Median :1.0000   Median : 668.8  
##  Mean   :2.159e+09   Mean   :199   Mean   :0.5299   Mean   :1017.7  
##  3rd Qu.:2.196e+09   3rd Qu.:245   3rd Qu.:1.0000   3rd Qu.:1670.2  
##  Max.   :3.353e+10   Max.   :444   Max.   :1.0000   Max.   :2036.3

Looking at outliers

# Row positions whose studentized residual under model4 exceeds 3 in
# absolute value, and the matching rows of the scaled data.
outliers2 <- which(abs(rstudent(model4)) > 3)
gas_outliers2 <- sample_scale4[outliers2, ]
summary(gas_outliers2)
##  receipt_gas_used      totalfee         past_median          eth_high      
##  Min.   :-0.37196   Min.   :  0.0375   Min.   :-0.97367   Min.   :-1.1048  
##  1st Qu.:-0.31856   1st Qu.:  0.3082   1st Qu.:-0.50020   1st Qu.:-0.6685  
##  Median :-0.26557   Median :  0.6669   Median : 0.03345   Median : 0.3776  
##  Mean   : 2.52735   Mean   :  3.1128   Mean   : 0.24611   Mean   : 0.3388  
##  3rd Qu.: 0.04586   3rd Qu.:  1.8969   3rd Qu.: 0.73053   3rd Qu.: 1.4572  
##  Max.   :82.08100   Max.   :398.5944   Max.   :12.42837   Max.   : 2.0662  
##  Eur_night     target       
##  0:209     Min.   :   12.0  
##  1:318     1st Qu.:  806.2  
##            Median : 1039.4  
##            Mean   : 2034.7  
##            3rd Qu.: 1744.4  
##            Max.   :34912.0
# Number of flagged observations under model4.
nrow(gas_outliers2)
## [1] 527
# Distribution of the gas price among the flagged observations.
hist(gas_outliers2$target)

# Show the flagged rows themselves.
print(gas_outliers2)

nonscaled data

# The same flagged rows in the unscaled data, printed and summarised.
print(sample[outliers2, ])
summary(object = sample[outliers2, ])
##   time_stamp         block_number           hash             gas_price      
##  Length:527         Min.   :10974925   Min.   :5.786e+73   Min.   :   12.0  
##  Class :character   1st Qu.:11338118   1st Qu.:2.789e+76   1st Qu.:  806.2  
##  Mode  :character   Median :11602472   Median :5.801e+76   Median : 1039.4  
##                     Mean   :11569641   Mean   :5.884e+76   Mean   : 2034.7  
##                     3rd Qu.:11849119   3rd Qu.:8.951e+76   3rd Qu.: 1744.4  
##                     Max.   :11947143   Max.   :1.157e+77   Max.   :34912.0  
##  receipt_gas_used      totalfee            past_min          past_max      
##  Min.   :   13116   Min.   :8.538e+06   Min.   :  3.299   Min.   :  27.44  
##  1st Qu.:   21000   1st Qu.:2.212e+07   1st Qu.: 38.638   1st Qu.:  90.82  
##  Median :   28823   Median :4.011e+07   Median : 63.985   Median : 149.26  
##  Mean   :  441183   Mean   :1.628e+08   Mean   : 80.570   Mean   : 174.66  
##  3rd Qu.:   74804   3rd Qu.:1.018e+08   3rd Qu.:106.459   3rd Qu.: 213.44  
##  Max.   :12186847   Max.   :2.000e+10   Max.   :788.409   Max.   :1673.71  
##   past_median         blck_med         blck_min         blck_max      
##  Min.   :  20.17   Min.   :   1.0   Min.   :  0.00   Min.   :   31.0  
##  1st Qu.:  58.31   1st Qu.:  50.0   1st Qu.: 15.05   1st Qu.:  212.6  
##  Median : 101.30   Median :  92.3   Median : 52.80   Median :  355.0  
##  Mean   : 118.43   Mean   : 112.9   Mean   : 72.99   Mean   :  949.5  
##  3rd Qu.: 157.45   3rd Qu.: 150.5   3rd Qu.:115.00   3rd Qu.:  700.0  
##  Max.   :1099.74   Max.   :1437.5   Max.   :700.00   Max.   :30000.0  
##  blck_totalfee         blck_count      Eur_night         eth_high     
##  Min.   :5.935e+07   Min.   :  8.0   Min.   :0.0000   Min.   : 342.6  
##  1st Qu.:7.732e+08   1st Qu.:166.0   1st Qu.:0.0000   1st Qu.: 575.6  
##  Median :1.337e+09   Median :204.0   Median :1.0000   Median :1134.3  
##  Mean   :1.648e+09   Mean   :209.4   Mean   :0.6034   Mean   :1113.6  
##  3rd Qu.:2.046e+09   3rd Qu.:241.5   3rd Qu.:1.0000   3rd Qu.:1711.0  
##  Max.   :3.353e+10   Max.   :582.0   Max.   :1.0000   Max.   :2036.3

First quartile gas is the same for outliers

What gas price would look like without outliers

# Unscaled data with the flagged rows removed. The rows to keep are listed
# explicitly: sample[-outliers2, ] would return zero rows if outliers2
# were empty, because negative indexing with an empty vector selects
# nothing.
summary(sample[setdiff(seq_len(nrow(sample)), outliers2), ])
##   time_stamp         block_number           hash             gas_price     
##  Length:195642      Min.   :10966876   Min.   :4.411e+71   Min.   :   0.0  
##  Class :character   1st Qu.:11244466   1st Qu.:2.885e+76   1st Qu.:  40.7  
##  Mode  :character   Median :11490178   Median :5.799e+76   Median :  72.0  
##                     Mean   :11481334   Mean   :5.789e+76   Mean   :  97.5  
##                     3rd Qu.:11724876   3rd Qu.:8.688e+76   3rd Qu.: 125.0  
##                     Max.   :11948958   Max.   :1.158e+77   Max.   :3139.4  
##  receipt_gas_used     totalfee            past_min          past_max       
##  Min.   :  13013   Min.   :0.000e+00   Min.   :   0.00   Min.   :   18.05  
##  1st Qu.:  21000   1st Qu.:1.428e+06   1st Qu.:  31.06   1st Qu.:   73.64  
##  Median :  37900   Median :2.960e+06   Median :  51.77   Median :  115.65  
##  Mean   :  67028   Mean   :6.234e+06   Mean   :  68.11   Mean   :  150.29  
##  3rd Qu.:  58384   3rd Qu.:6.402e+06   3rd Qu.:  87.50   3rd Qu.:  188.19  
##  Max.   :7672865   Max.   :1.104e+09   Max.   :1330.40   Max.   :14309.59  
##   past_median         blck_med          blck_min          blck_max       
##  Min.   :  13.00   Min.   :   0.00   Min.   :   0.00   Min.   :     1.0  
##  1st Qu.:  47.56   1st Qu.:  40.00   1st Qu.:  16.00   1st Qu.:   166.7  
##  Median :  74.08   Median :  68.00   Median :  42.00   Median :   259.5  
##  Mean   :  98.55   Mean   :  91.73   Mean   :  60.54   Mean   :   617.2  
##  3rd Qu.: 126.12   3rd Qu.: 118.00   3rd Qu.:  84.00   3rd Qu.:   500.0  
##  Max.   :1823.15   Max.   :3888.00   Max.   :1400.45   Max.   :210000.0  
##  blck_totalfee         blck_count      Eur_night         eth_high     
##  Min.   :5.630e+06   Min.   :  5.0   Min.   :0.0000   Min.   : 342.6  
##  1st Qu.:5.930e+08   1st Qu.:163.0   1st Qu.:0.0000   1st Qu.: 465.7  
##  Median :9.647e+08   Median :196.0   Median :1.0000   Median : 637.1  
##  Mean   :1.279e+09   Mean   :202.3   Mean   :0.5876   Mean   : 932.2  
##  3rd Qu.:1.622e+09   3rd Qu.:232.0   3rd Qu.:1.0000   3rd Qu.:1378.9  
##  Max.   :9.682e+10   Max.   :596.0   Max.   :1.0000   Max.   :2036.3

Looking at what fraction of the data has a gas price over 800 (and over 600)

# Share of observations with a gas price above 800.
sum(sample$gas_price > 800) / nrow(sample)
## [1] 0.002202183
# Share of observations with a gas price above 600.
sum(sample$gas_price > 600) / nrow(sample)
## [1] 0.005683875

Remove all gas prices over 600, and all gas prices of exactly 0

# Drop rows with a gas price above 600 or equal to 0.
# The rows to keep are listed explicitly: sample_scale2[-over600, ] would
# return zero rows if over600 were empty, because negative indexing with
# an empty vector selects nothing.
over600 <- which(sample_scale2$target > 600 | sample_scale2$target == 0)
sample_scale5 <- sample_scale2[setdiff(seq_len(nrow(sample_scale2)), over600), ]

refit model

# Refit the seven-predictor model on the trimmed data.
model5 <- lm(formula = target ~ ., data = sample_scale5)
summary(model5)
## 
## Call:
## lm(formula = target ~ ., data = sample_scale5)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -5633.4   -31.1   -10.3    17.3   875.0 
## 
## Coefficients:
##                   Estimate Std. Error  t value Pr(>|t|)    
## (Intercept)       96.10637    0.22366  429.704  < 2e-16 ***
## receipt_gas_used -24.47749    0.19265 -127.059  < 2e-16 ***
## totalfee         122.33204    0.67673  180.770  < 2e-16 ***
## past_min           0.69243    0.35589    1.946   0.0517 .  
## past_max          -0.08713    0.21222   -0.411   0.6814    
## past_median        8.60782    0.41193   20.896  < 2e-16 ***
## eth_high          31.83786    0.18768  169.640  < 2e-16 ***
## Eur_night1         1.57567    0.29169    5.402  6.6e-08 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 63.36 on 195011 degrees of freedom
## Multiple R-squared:  0.4048, Adjusted R-squared:  0.4048 
## F-statistic: 1.895e+04 on 7 and 195011 DF,  p-value: < 2.2e-16

Find best fit - backward

# Backward elimination from model5 with the AIC penalty (k = 2).
step(object = model5, direction = "backward", k = 2)
## Start:  AIC=1618208
## target ~ receipt_gas_used + totalfee + past_min + past_max + 
##     past_median + eth_high + Eur_night
## 
##                    Df Sum of Sq       RSS     AIC
## - past_max          1       677 782862144 1618207
## <none>                          782861468 1618208
## - past_min          1     15197 782876664 1618210
## - Eur_night         1    117141 782978608 1618236
## - past_median       1   1752926 784614393 1618643
## - receipt_gas_used  1  64808861 847670329 1633717
## - eth_high          1 115526317 898387785 1645050
## - totalfee          1 131183615 914045083 1648420
## 
## Step:  AIC=1618207
## target ~ receipt_gas_used + totalfee + past_min + past_median + 
##     eth_high + Eur_night
## 
##                    Df Sum of Sq       RSS     AIC
## <none>                          782862144 1618207
## - past_min          1     15598 782877742 1618208
## - Eur_night         1    117231 782979376 1618234
## - past_median       1   2150246 785012390 1618739
## - receipt_gas_used  1  64809554 847671698 1633716
## - eth_high          1 116514317 899376461 1645262
## - totalfee          1 131182990 914045134 1648418
## 
## Call:
## lm(formula = target ~ receipt_gas_used + totalfee + past_min + 
##     past_median + eth_high + Eur_night, data = sample_scale5)
## 
## Coefficients:
##      (Intercept)  receipt_gas_used          totalfee          past_min  
##          96.1060          -24.4776          122.3316            0.7004  
##      past_median          eth_high        Eur_night1  
##           8.5324           31.8448            1.5763

Without the largest gas prices, past_min is kept in the model.

Looking at forward model

# Forward selection on the trimmed data: start from the intercept-only
# model and allow terms up to those in model5, with the AIC penalty (k = 2).
basemodel2 <- lm(formula = target ~ 1, data = sample_scale5)
step(
  object = basemodel2,
  scope = list(lower = ~1, upper = model5),
  direction = "forward",
  k = 2
)
## Start:  AIC=1719387
## target ~ 1
## 
##                    Df Sum of Sq        RSS     AIC
## + eth_high          1 387068043  928257660 1651419
## + past_median       1 229525185 1085800517 1681990
## + past_min          1 200722764 1114602938 1687096
## + totalfee          1 136617127 1178708575 1698002
## + past_max          1 104790262 1210535440 1703198
## + receipt_gas_used  1   1121792 1314203910 1719222
## + Eur_night         1   1047792 1314277910 1719233
## <none>                          1315325702 1719387
## 
## Step:  AIC=1651419
## target ~ eth_high
## 
##                    Df Sum of Sq       RSS     AIC
## + totalfee          1  68491243 859766416 1636473
## + past_median       1  13728466 914529194 1648515
## + past_min          1  10879594 917378065 1649121
## + past_max          1   6082521 922175138 1650138
## + receipt_gas_used  1    299067 927958593 1651358
## + Eur_night         1    287690 927969970 1651360
## <none>                          928257660 1651419
## 
## Step:  AIC=1636473
## target ~ eth_high + totalfee
## 
##                    Df Sum of Sq       RSS     AIC
## + receipt_gas_used  1  66774158 792992258 1620708
## + past_median       1  11900391 847866026 1633756
## + past_min          1   9388195 850378222 1634333
## + past_max          1   5173823 854592593 1635298
## + Eur_night         1    270018 859496399 1636413
## <none>                          859766416 1636473
## 
## Step:  AIC=1620708
## target ~ eth_high + totalfee + receipt_gas_used
## 
##               Df Sum of Sq       RSS     AIC
## + past_median  1   9997250 782995008 1618236
## + past_min     1   7847310 785144948 1618770
## + past_max     1   4370946 788621312 1619632
## + Eur_night    1    191814 792800445 1620663
## <none>                     792992258 1620708
## 
## Step:  AIC=1618236
## target ~ eth_high + totalfee + receipt_gas_used + past_median
## 
##             Df Sum of Sq       RSS     AIC
## + Eur_night  1    117267 782877742 1618208
## + past_min   1     15633 782979376 1618234
## <none>                   782995008 1618236
## + past_max   1      1191 782993817 1618237
## 
## Step:  AIC=1618208
## target ~ eth_high + totalfee + receipt_gas_used + past_median + 
##     Eur_night
## 
##            Df Sum of Sq       RSS     AIC
## + past_min  1   15597.5 782862144 1618207
## <none>                  782877742 1618208
## + past_max  1    1077.5 782876664 1618210
## 
## Step:  AIC=1618207
## target ~ eth_high + totalfee + receipt_gas_used + past_median + 
##     Eur_night + past_min
## 
##            Df Sum of Sq       RSS     AIC
## <none>                  782862144 1618207
## + past_max  1    676.68 782861468 1618208
## 
## Call:
## lm(formula = target ~ eth_high + totalfee + receipt_gas_used + 
##     past_median + Eur_night + past_min, data = sample_scale5)
## 
## Coefficients:
##      (Intercept)          eth_high          totalfee  receipt_gas_used  
##          96.1060           31.8448          122.3316          -24.4776  
##      past_median        Eur_night1          past_min  
##           8.5324            1.5763            0.7004

Backward and forward selection arrive at the same model, so refit it with the selected predictors (eth_high, totalfee, receipt_gas_used, past_median, Eur_night, past_min)

# Refit on the predictors both selection directions kept.
# Column 4 of sample_scale5 is dropped by position; presumably this is
# past_max (the only candidate missing from the selected model) - confirm.
sample_scale6 <- sample_scale5[, -4]
model6 <- lm(target ~ ., data = sample_scale6)
summary(model6)
## 
## Call:
## lm(formula = target ~ ., data = sample_scale6)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -5633.4   -31.1   -10.3    17.3   875.0 
## 
## Coefficients:
##                  Estimate Std. Error  t value Pr(>|t|)    
## (Intercept)       96.1060     0.2237  429.706  < 2e-16 ***
## receipt_gas_used -24.4776     0.1926 -127.060  < 2e-16 ***
## totalfee         122.3316     0.6767  180.770  < 2e-16 ***
## past_min           0.7004     0.3554    1.971   0.0487 *  
## past_median        8.5324     0.3687   23.144  < 2e-16 ***
## eth_high          31.8448     0.1869  170.364  < 2e-16 ***
## Eur_night1         1.5763     0.2917    5.404 6.53e-08 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 63.36 on 195012 degrees of freedom
## Multiple R-squared:  0.4048, Adjusted R-squared:  0.4048 
## F-statistic: 2.211e+04 on 6 and 195012 DF,  p-value: < 2.2e-16
# Standard lm diagnostic plots for model6.
plot(model6)

# Box-Cox plot for model6.
# NOTE(review): boxcox() is presumably MASS::boxcox - confirm the package is
# attached earlier in the file.
boxcox(model6)

DFBetas Gas Amount

# DFBETAS for the receipt_gas_used ("gas amount") coefficient of model6.
# dfbetas() returns one column per coefficient *including* "(Intercept)", so
# column 1 is the intercept, not the first predictor. Select the column by
# name so the plot and the flagged rows really refer to receipt_gas_used.
# (The count recorded below came from the intercept column; re-run to refresh.)
n2 <- nrow(sample_scale6)
dfb_cutoff_gas <- 2 / sqrt(n2)  # size-adjusted DFBETAS cutoff
dfb_gas <- dfbetas(model6)[, "receipt_gas_used"]  # computed once, reused below
plot(dfb_gas, xlab = "index", ylab = "dfbeta_1",
     pch = 16, main = "Gas amount",
     cex = 1.5, cex.main = 1.5, cex.lab = 1.5, cex.axis = 1.5)
abline(h = c(-dfb_cutoff_gas, dfb_cutoff_gas), col = "red", lwd = 3, lty = 2)

# Row positions (within sample_scale6) whose DFBETAS lies outside the band.
b1 <- which(abs(dfb_gas) > dfb_cutoff_gas)
length(b1)
## [1] 7713
# b1 holds row *positions* in sample_scale6 (195019 rows), while `sample` has
# 196169 rows, so sample[b1, ] summarises the wrong observations. Look the rows
# up by row name instead.
# NOTE(review): assumes sample_scale6 still carries the row names of `sample`
# - confirm. (The summary recorded below predates this fix.)
summary(sample[rownames(sample_scale6)[b1], ])
##   time_stamp         block_number           hash             gas_price      
##  Length:7713        Min.   :10966884   Min.   :1.651e+73   Min.   :    0.0  
##  Class :character   1st Qu.:11574703   1st Qu.:2.954e+76   1st Qu.:   74.1  
##  Mode  :character   Median :11747768   Median :5.794e+76   Median :  128.0  
##                     Mean   :11667058   Mean   :5.797e+76   Mean   :  160.6  
##                     3rd Qu.:11834203   3rd Qu.:8.676e+76   3rd Qu.:  193.0  
##                     Max.   :11941848   Max.   :1.158e+77   Max.   :21621.0  
##  receipt_gas_used     totalfee            past_min          past_max      
##  Min.   :  13013   Min.   :0.000e+00   Min.   :  1.195   Min.   :  24.08  
##  1st Qu.:  21000   1st Qu.:2.493e+06   1st Qu.: 49.828   1st Qu.: 110.35  
##  Median :  36170   Median :4.762e+06   Median : 83.578   Median : 172.20  
##  Mean   :  65899   Mean   :9.869e+06   Mean   : 96.144   Mean   : 200.87  
##  3rd Qu.:  57369   3rd Qu.:1.015e+07   3rd Qu.:125.050   3rd Qu.: 245.97  
##  Max.   :8655474   Max.   :1.104e+09   Max.   :763.853   Max.   :5527.15  
##   past_median         blck_med          blck_min         blck_max       
##  Min.   :  17.60   Min.   :   1.00   Min.   :  0.00   Min.   :    20.0  
##  1st Qu.:  73.82   1st Qu.:  64.64   1st Qu.: 16.00   1st Qu.:   206.4  
##  Median : 124.36   Median : 115.00   Median : 69.00   Median :   324.4  
##  Mean   : 140.29   Mean   : 131.94   Mean   : 85.48   Mean   :   683.8  
##  3rd Qu.: 179.29   3rd Qu.: 172.70   3rd Qu.:128.00   3rd Qu.:   550.0  
##  Max.   :1249.32   Max.   :2529.50   Max.   :845.00   Max.   :210000.0  
##  blck_totalfee         blck_count      Eur_night         eth_high     
##  Min.   :1.633e+07   Min.   : 11.0   Min.   :0.0000   Min.   : 342.6  
##  1st Qu.:9.254e+08   1st Qu.:167.0   1st Qu.:0.0000   1st Qu.: 754.3  
##  Median :1.551e+09   Median :202.0   Median :1.0000   Median :1382.7  
##  Mean   :1.787e+09   Mean   :210.5   Mean   :0.5684   Mean   :1322.6  
##  3rd Qu.:2.283e+09   3rd Qu.:241.0   3rd Qu.:1.0000   3rd Qu.:1781.4  
##  Max.   :3.353e+10   Max.   :595.0   Max.   :1.0000   Max.   :2036.3

Closer look

# Zoomed view of the receipt_gas_used DFBETAS (y-axis clipped to [-0.1, 0.1]).
# The column is picked by name because column 1 of dfbetas() is "(Intercept)".
plot(dfbetas(model6)[, "receipt_gas_used"], xlab = "index", ylab = "dfbeta_1",
     pch = 16, main = "Gas amount",
     cex = 1.5, cex.main = 1.5, cex.lab = 1.5, cex.axis = 1.5,
     ylim = c(-0.1, 0.1))
abline(h = c(-2, 2) / sqrt(n2), col = "red", lwd = 3, lty = 2)

DFBetas Total Fee

# DFBETAS for the totalfee coefficient of model6.
# Column 2 of dfbetas() is receipt_gas_used (column 1 is "(Intercept)"), so the
# column is selected by name to match the "total fee" title.
# (The count recorded below came from column 2; re-run to refresh.)
dfb_fee <- dfbetas(model6)[, "totalfee"]  # computed once, reused below
dfb_cutoff_fee <- 2 / sqrt(n2)
plot(dfb_fee, xlab = "index", ylab = "dfbeta_2",
     pch = 16, main = "total fee",
     cex = 1.5, cex.main = 1.5, cex.lab = 1.5, cex.axis = 1.5)
abline(h = c(-dfb_cutoff_fee, dfb_cutoff_fee), col = "red", lwd = 3, lty = 2)

# Row positions (within sample_scale6) whose DFBETAS lies outside the band.
b2 <- which(abs(dfb_fee) > dfb_cutoff_fee)
length(b2)
## [1] 4165
# b2 holds row *positions* in sample_scale6 (195019 rows), while `sample` has
# 196169 rows, so sample[b2, ] summarises the wrong observations. Look the rows
# up by row name instead.
# NOTE(review): assumes sample_scale6 still carries the row names of `sample`
# - confirm. (The summary recorded below predates this fix.)
summary(sample[rownames(sample_scale6)[b2], ])
##   time_stamp         block_number           hash             gas_price      
##  Length:4165        Min.   :10967706   Min.   :1.651e+73   Min.   :    1.0  
##  Class :character   1st Qu.:11332601   1st Qu.:3.053e+76   1st Qu.:   57.0  
##  Mode  :character   Median :11607413   Median :5.896e+76   Median :  111.0  
##                     Mean   :11557751   Mean   :5.817e+76   Mean   :  150.1  
##                     3rd Qu.:11807857   3rd Qu.:8.666e+76   3rd Qu.:  182.0  
##                     Max.   :11943217   Max.   :1.157e+77   Max.   :21621.0  
##  receipt_gas_used     totalfee            past_min          past_max      
##  Min.   :  13013   Min.   :    21000   Min.   :  1.195   Min.   :  23.46  
##  1st Qu.:  21000   1st Qu.:  2095760   1st Qu.: 40.703   1st Qu.:  92.53  
##  Median :  36712   Median :  4181077   Median : 68.760   Median : 153.20  
##  Mean   :  63157   Mean   :  8797086   Mean   : 85.191   Mean   : 185.88  
##  3rd Qu.:  57369   3rd Qu.:  9171255   3rd Qu.:110.422   3rd Qu.: 224.43  
##  Max.   :2716773   Max.   :454040871   Max.   :745.720   Max.   :2792.06  
##   past_median         blck_med         blck_min         blck_max      
##  Min.   :  15.48   Min.   :   1.0   Min.   :  0.00   Min.   :   18.7  
##  1st Qu.:  61.48   1st Qu.:  52.0   1st Qu.: 16.10   1st Qu.:  200.0  
##  Median : 104.94   Median :  93.0   Median : 55.20   Median :  303.2  
##  Mean   : 125.21   Mean   : 115.8   Mean   : 75.58   Mean   :  637.3  
##  3rd Qu.: 157.68   3rd Qu.: 150.0   3rd Qu.:110.00   3rd Qu.:  510.1  
##  Max.   :1472.47   Max.   :1918.2   Max.   :845.00   Max.   :32858.3  
##  blck_totalfee         blck_count      Eur_night        eth_high     
##  Min.   :1.725e+07   Min.   : 20.0   Min.   :0.000   Min.   : 342.6  
##  1st Qu.:7.556e+08   1st Qu.:167.0   1st Qu.:0.000   1st Qu.: 579.4  
##  Median :1.302e+09   Median :202.0   Median :1.000   Median :1209.4  
##  Mean   :1.601e+09   Mean   :208.6   Mean   :0.606   Mean   :1120.2  
##  3rd Qu.:2.023e+09   3rd Qu.:238.0   3rd Qu.:1.000   3rd Qu.:1690.0  
##  Max.   :3.143e+10   Max.   :594.0   Max.   :1.000   Max.   :2036.3

Closer look

# Zoomed view of the totalfee DFBETAS (y-axis clipped to [-1, 3]).
# Fixes two mislabels: the column is picked by name (column 2 of dfbetas() is
# receipt_gas_used, not totalfee) and the y label is dfbeta_2, not dfbeta_1.
plot(dfbetas(model6)[, "totalfee"], xlab = "index", ylab = "dfbeta_2",
     pch = 16, main = "Total Fee",
     cex = 1.5, cex.main = 1.5, cex.lab = 1.5, cex.axis = 1.5,
     ylim = c(-1, 3))
abline(h = c(-2, 2) / sqrt(n2), col = "red", lwd = 3, lty = 2)

DFBetas Past Min

# DFBETAS for the past_min coefficient of model6.
# Column 3 of dfbetas() is totalfee (column 1 is "(Intercept)"), so the column
# is selected by name; the y label is dfbeta_3 to match the zoomed plot.
# (The count recorded below came from column 3; re-run to refresh.)
dfb_past_min <- dfbetas(model6)[, "past_min"]  # computed once, reused below
dfb_cutoff_past_min <- 2 / sqrt(n2)
plot(dfb_past_min, xlab = "index", ylab = "dfbeta_3",
     pch = 16, main = "Past Min",
     cex = 1.5, cex.main = 1.5, cex.lab = 1.5, cex.axis = 1.5)
abline(h = c(-dfb_cutoff_past_min, dfb_cutoff_past_min),
       col = "red", lwd = 3, lty = 2)

# Row positions (within sample_scale6) whose DFBETAS lies outside the band.
b3 <- which(abs(dfb_past_min) > dfb_cutoff_past_min)
length(b3)
## [1] 5443
# b3 holds row *positions* in sample_scale6 (195019 rows), while `sample` has
# 196169 rows, so sample[b3, ] summarises the wrong observations. Look the rows
# up by row name instead.
# NOTE(review): assumes sample_scale6 still carries the row names of `sample`
# - confirm. (The summary recorded below predates this fix.)
summary(sample[rownames(sample_scale6)[b3], ])
##   time_stamp         block_number           hash             gas_price       
##  Length:5443        Min.   :10966888   Min.   :1.651e+73   Min.   :    1.00  
##  Class :character   1st Qu.:11403401   1st Qu.:3.032e+76   1st Qu.:   64.06  
##  Mode  :character   Median :11632202   Median :5.899e+76   Median :  125.00  
##                     Mean   :11594900   Mean   :5.829e+76   Mean   :  157.42  
##                     3rd Qu.:11827010   3rd Qu.:8.725e+76   3rd Qu.:  197.00  
##                     Max.   :11943217   Max.   :1.158e+77   Max.   :21620.99  
##  receipt_gas_used     totalfee            past_min          past_max      
##  Min.   :  13141   Min.   :    21000   Min.   :  1.194   Min.   :  23.82  
##  1st Qu.:  21000   1st Qu.:  2349268   1st Qu.: 44.244   1st Qu.: 101.15  
##  Median :  36824   Median :  4545450   Median : 77.542   Median : 166.41  
##  Mean   :  62907   Mean   :  9412905   Mean   : 92.528   Mean   : 196.57  
##  3rd Qu.:  57369   3rd Qu.: 10068764   3rd Qu.:121.866   3rd Qu.: 240.12  
##  Max.   :2716773   Max.   :454040871   Max.   :763.853   Max.   :5527.15  
##   past_median         blck_med          blck_min         blck_max      
##  Min.   :  17.59   Min.   :   1.00   Min.   :  0.00   Min.   :   18.7  
##  1st Qu.:  66.88   1st Qu.:  56.64   1st Qu.: 17.10   1st Qu.:  207.9  
##  Median : 118.29   Median : 106.00   Median : 63.00   Median :  326.2  
##  Mean   : 135.16   Mean   : 125.95   Mean   : 83.31   Mean   :  669.4  
##  3rd Qu.: 172.66   3rd Qu.: 166.30   3rd Qu.:123.75   3rd Qu.:  547.5  
##  Max.   :1472.47   Max.   :1918.24   Max.   :845.00   Max.   :32858.3  
##  blck_totalfee         blck_count      Eur_night        eth_high     
##  Min.   :1.633e+07   Min.   : 25.0   Min.   :0.000   Min.   : 342.6  
##  1st Qu.:8.352e+08   1st Qu.:168.0   1st Qu.:0.000   1st Qu.: 610.0  
##  Median :1.462e+09   Median :203.0   Median :1.000   Median :1282.6  
##  Mean   :1.722e+09   Mean   :210.0   Mean   :0.605   Mean   :1199.8  
##  3rd Qu.:2.212e+09   3rd Qu.:239.5   3rd Qu.:1.000   3rd Qu.:1770.6  
##  Max.   :3.143e+10   Max.   :594.0   Max.   :1.000   Max.   :2036.3
# Zoomed view of the past_min DFBETAS (y-axis clipped to [-0.1, 0.1]).
# The column is picked by name because column 3 of dfbetas() is totalfee
# (column 1 is "(Intercept)").
plot(dfbetas(model6)[, "past_min"], xlab = "index", ylab = "dfbeta_3",
     pch = 16, main = "Past Min",
     cex = 1.5, cex.main = 1.5, cex.lab = 1.5, cex.axis = 1.5,
     ylim = c(-0.1, 0.1))
abline(h = c(-2, 2) / sqrt(n2), col = "red", lwd = 3, lty = 2)

DFFits

# DFFITS for model6 against the size-adjusted cutoff 2 * sqrt((k + 1) / n).
# k3 is derived from the fitted model (number of non-intercept coefficients,
# 6 for model6) so the cutoff stays right if the model changes, and dffits()
# is computed once instead of three times.
k3 <- length(coef(model6)) - 1L
dffits_model6 <- dffits(model6)
dffits_cutoff <- 2 * sqrt((k3 + 1) / n2)
plot(dffits_model6, xlab = "index", ylab = "dffits",
     pch = 16, main = "DFFITS ",
     cex = 1.5, cex.main = 2, cex.lab = 1.5, cex.axis = 1.5)
abline(h = c(-dffits_cutoff, dffits_cutoff), col = "red", lwd = 3, lty = 2)

# Row positions (within sample_scale6) whose DFFITS lies outside the band.
yinfluence <- which(abs(dffits_model6) > dffits_cutoff)
length(yinfluence)
## [1] 9798
# yinfluence holds row *positions* in sample_scale6 (195019 rows), while
# `sample` has 196169 rows, so sample[yinfluence, ] summarises the wrong
# observations. Look the rows up by row name instead.
# NOTE(review): assumes sample_scale6 still carries the row names of `sample`
# - confirm. (The summary recorded below predates this fix.)
summary(sample[rownames(sample_scale6)[yinfluence], ])
##   time_stamp         block_number           hash             gas_price      
##  Length:9798        Min.   :10967706   Min.   :1.651e+73   Min.   :    1.0  
##  Class :character   1st Qu.:11585838   1st Qu.:2.844e+76   1st Qu.:   91.0  
##  Mode  :character   Median :11794648   Median :5.769e+76   Median :  146.0  
##                     Mean   :11687803   Mean   :5.752e+76   Mean   :  178.6  
##                     3rd Qu.:11872034   3rd Qu.:8.675e+76   3rd Qu.:  211.0  
##                     Max.   :11943217   Max.   :1.158e+77   Max.   :21621.0  
##  receipt_gas_used     totalfee            past_min          past_max      
##  Min.   :  13013   Min.   :2.100e+04   Min.   :  1.023   Min.   :  23.82  
##  1st Qu.:  21000   1st Qu.:2.884e+06   1st Qu.: 57.965   1st Qu.: 136.58  
##  Median :  36294   Median :5.158e+06   Median : 97.925   Median : 194.45  
##  Mean   :  63778   Mean   :1.090e+07   Mean   :108.367   Mean   : 225.12  
##  3rd Qu.:  57381   3rd Qu.:1.156e+07   3rd Qu.:138.835   3rd Qu.: 265.64  
##  Max.   :5460163   Max.   :1.104e+09   Max.   :895.098   Max.   :5527.15  
##   past_median         blck_med         blck_min          blck_max       
##  Min.   :  17.85   Min.   :   1.0   Min.   :   0.00   Min.   :    18.7  
##  1st Qu.:  93.95   1st Qu.:  80.0   1st Qu.:  16.67   1st Qu.:   234.0  
##  Median : 144.40   Median : 133.2   Median :  90.00   Median :   360.0  
##  Mean   : 159.83   Mean   : 149.8   Mean   :  97.83   Mean   :   767.3  
##  3rd Qu.: 196.65   3rd Qu.: 191.3   3rd Qu.: 145.20   3rd Qu.:   594.5  
##  Max.   :1687.49   Max.   :2529.5   Max.   :1400.45   Max.   :210000.0  
##  blck_totalfee         blck_count      Eur_night         eth_high     
##  Min.   :1.633e+07   Min.   : 25.0   Min.   :0.0000   Min.   : 342.6  
##  1st Qu.:1.159e+09   1st Qu.:171.0   1st Qu.:0.0000   1st Qu.:1134.3  
##  Median :1.792e+09   Median :206.0   Median :1.0000   Median :1689.2  
##  Mean   :2.040e+09   Mean   :214.2   Mean   :0.6042   Mean   :1416.1  
##  3rd Qu.:2.530e+09   3rd Qu.:244.0   3rd Qu.:1.0000   3rd Qu.:1833.8  
##  Max.   :3.353e+10   Max.   :594.0   Max.   :1.0000   Max.   :2036.3

Removing observations with a target gas price greater than 350 or equal to 0

# Drop observations whose target is above 350 or exactly 0, then refit.
# Rows are kept via an explicit index set rather than x[-which(cond), ]:
# when nothing matches, -integer(0) would select *zero* rows instead of all.
outliers3 <- which(sample_scale2$target > 350 | sample_scale2$target == 0)
keep_rows7 <- setdiff(seq_len(nrow(sample_scale2)), outliers3)
sample_scale7 <- sample_scale2[keep_rows7, ]
model7 <- lm(target ~ ., data = sample_scale7)
summary(model7)
## 
## Call:
## lm(formula = target ~ ., data = sample_scale7)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -4259.6   -28.0    -8.0    20.1   633.2 
## 
## Coefficients:
##                  Estimate Std. Error  t value Pr(>|t|)    
## (Intercept)       90.7715     0.1799  504.579  < 2e-16 ***
## receipt_gas_used -18.1512     0.1607 -112.961  < 2e-16 ***
## totalfee          92.8845     0.6114  151.927  < 2e-16 ***
## past_min           1.7774     0.2909    6.110 9.99e-10 ***
## past_max          -0.3080     0.1706   -1.806    0.071 .  
## past_median        6.6169     0.3357   19.713  < 2e-16 ***
## eth_high          30.1773     0.1520  198.555  < 2e-16 ***
## Eur_night1         1.5414     0.2343    6.578 4.77e-11 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 50.45 on 191577 degrees of freedom
## Multiple R-squared:  0.4322, Adjusted R-squared:  0.4322 
## F-statistic: 2.083e+04 on 7 and 191577 DF,  p-value: < 2.2e-16
# Standard lm diagnostic plots for the trimmed-data model.
plot(model7)

# Backward elimination starting from model7; k = 2 is the AIC penalty.
step(
  model7,
  k = 2,
  direction = "backward"
)
## Start:  AIC=1502422
## target ~ receipt_gas_used + totalfee + past_min + past_max + 
##     past_median + eth_high + Eur_night
## 
##                    Df Sum of Sq       RSS     AIC
## <none>                          487631610 1502422
## - past_max          1      8299 487639909 1502424
## - past_min          1     95021 487726631 1502458
## - Eur_night         1    110147 487741757 1502464
## - past_median       1    989162 488620772 1502809
## - receipt_gas_used  1  32479075 520110684 1514774
## - totalfee          1  58751470 546383079 1524215
## - eth_high          1 100347856 587979465 1538272
## 
## Call:
## lm(formula = target ~ receipt_gas_used + totalfee + past_min + 
##     past_max + past_median + eth_high + Eur_night, data = sample_scale7)
## 
## Coefficients:
##      (Intercept)  receipt_gas_used          totalfee          past_min  
##           90.771           -18.151            92.885             1.777  
##         past_max       past_median          eth_high        Eur_night1  
##           -0.308             6.617            30.177             1.541