##Εισαγωγή Δεδομένων
# Attach the packages whose functions this script calls:
# dplyr (glimpse, select, %>%), tidyr (pivot_longer), tibble (tibble),
# ggplot2 (plots) and broom (glance).
library(dplyr)
library(tidyr)
library(tibble)
library(ggplot2)
library(broom)

# Read the transactions from a CSV file (path is relative to the working
# directory) and print a compact column-by-column overview.
data <- read.csv("creditcard.csv")
glimpse(data)
## Rows: 284,807
## Columns: 31
## $ Time <dbl> 0, 0, 1, 1, 2, 2, 4, 7, 7, 9, 10, 10, 10, 11, 12, 12, 12, 13, 1…
## $ V1 <dbl> -1.3598071, 1.1918571, -1.3583541, -0.9662717, -1.1582331, -0.4…
## $ V2 <dbl> -0.07278117, 0.26615071, -1.34016307, -0.18522601, 0.87773675, …
## $ V3 <dbl> 2.53634674, 0.16648011, 1.77320934, 1.79299334, 1.54871785, 1.1…
## $ V4 <dbl> 1.37815522, 0.44815408, 0.37977959, -0.86329128, 0.40303393, -0…
## $ V5 <dbl> -0.33832077, 0.06001765, -0.50319813, -0.01030888, -0.40719338,…
## $ V6 <dbl> 0.46238778, -0.08236081, 1.80049938, 1.24720317, 0.09592146, -0…
## $ V7 <dbl> 0.239598554, -0.078802983, 0.791460956, 0.237608940, 0.59294074…
## $ V8 <dbl> 0.098697901, 0.085101655, 0.247675787, 0.377435875, -0.27053267…
## $ V9 <dbl> 0.3637870, -0.2554251, -1.5146543, -1.3870241, 0.8177393, -0.56…
## $ V10 <dbl> 0.09079417, -0.16697441, 0.20764287, -0.05495192, 0.75307443, -…
## $ V11 <dbl> -0.55159953, 1.61272666, 0.62450146, -0.22648726, -0.82284288, …
## $ V12 <dbl> -0.61780086, 1.06523531, 0.06608369, 0.17822823, 0.53819555, 0.…
## $ V13 <dbl> -0.99138985, 0.48909502, 0.71729273, 0.50775687, 1.34585159, -0…
## $ V14 <dbl> -0.31116935, -0.14377230, -0.16594592, -0.28792375, -1.11966983…
## $ V15 <dbl> 1.468176972, 0.635558093, 2.345864949, -0.631418118, 0.17512113…
## $ V16 <dbl> -0.47040053, 0.46391704, -2.89008319, -1.05964725, -0.45144918,…
## $ V17 <dbl> 0.207971242, -0.114804663, 1.109969379, -0.684092786, -0.237033…
## $ V18 <dbl> 0.02579058, -0.18336127, -0.12135931, 1.96577500, -0.03819479, …
## $ V19 <dbl> 0.40399296, -0.14578304, -2.26185710, -1.23262197, 0.80348692, …
## $ V20 <dbl> 0.25141210, -0.06908314, 0.52497973, -0.20803778, 0.40854236, 0…
## $ V21 <dbl> -0.018306778, -0.225775248, 0.247998153, -0.108300452, -0.00943…
## $ V22 <dbl> 0.277837576, -0.638671953, 0.771679402, 0.005273597, 0.79827849…
## $ V23 <dbl> -0.110473910, 0.101288021, 0.909412262, -0.190320519, -0.137458…
## $ V24 <dbl> 0.06692807, -0.33984648, -0.68928096, -1.17557533, 0.14126698, …
## $ V25 <dbl> 0.12853936, 0.16717040, -0.32764183, 0.64737603, -0.20600959, -…
## $ V26 <dbl> -0.18911484, 0.12589453, -0.13909657, -0.22192884, 0.50229222, …
## $ V27 <dbl> 0.133558377, -0.008983099, -0.055352794, 0.062722849, 0.2194222…
## $ V28 <dbl> -0.021053053, 0.014724169, -0.059751841, 0.061457629, 0.2151531…
## $ Amount <dbl> 149.62, 2.69, 378.66, 123.50, 69.99, 3.67, 4.99, 40.80, 93.20, …
## $ Class <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
# Print summary statistics (min, quartiles, mean, max) for every column.
summary(data)
## Time V1 V2 V3
## Min. : 0 Min. :-56.40751 Min. :-72.71573 Min. :-48.3256
## 1st Qu.: 54202 1st Qu.: -0.92037 1st Qu.: -0.59855 1st Qu.: -0.8904
## Median : 84692 Median : 0.01811 Median : 0.06549 Median : 0.1799
## Mean : 94814 Mean : 0.00000 Mean : 0.00000 Mean : 0.0000
## 3rd Qu.:139321 3rd Qu.: 1.31564 3rd Qu.: 0.80372 3rd Qu.: 1.0272
## Max. :172792 Max. : 2.45493 Max. : 22.05773 Max. : 9.3826
## V4 V5 V6 V7
## Min. :-5.68317 Min. :-113.74331 Min. :-26.1605 Min. :-43.5572
## 1st Qu.:-0.84864 1st Qu.: -0.69160 1st Qu.: -0.7683 1st Qu.: -0.5541
## Median :-0.01985 Median : -0.05434 Median : -0.2742 Median : 0.0401
## Mean : 0.00000 Mean : 0.00000 Mean : 0.0000 Mean : 0.0000
## 3rd Qu.: 0.74334 3rd Qu.: 0.61193 3rd Qu.: 0.3986 3rd Qu.: 0.5704
## Max. :16.87534 Max. : 34.80167 Max. : 73.3016 Max. :120.5895
## V8 V9 V10 V11
## Min. :-73.21672 Min. :-13.43407 Min. :-24.58826 Min. :-4.79747
## 1st Qu.: -0.20863 1st Qu.: -0.64310 1st Qu.: -0.53543 1st Qu.:-0.76249
## Median : 0.02236 Median : -0.05143 Median : -0.09292 Median :-0.03276
## Mean : 0.00000 Mean : 0.00000 Mean : 0.00000 Mean : 0.00000
## 3rd Qu.: 0.32735 3rd Qu.: 0.59714 3rd Qu.: 0.45392 3rd Qu.: 0.73959
## Max. : 20.00721 Max. : 15.59500 Max. : 23.74514 Max. :12.01891
## V12 V13 V14 V15
## Min. :-18.6837 Min. :-5.79188 Min. :-19.2143 Min. :-4.49894
## 1st Qu.: -0.4056 1st Qu.:-0.64854 1st Qu.: -0.4256 1st Qu.:-0.58288
## Median : 0.1400 Median :-0.01357 Median : 0.0506 Median : 0.04807
## Mean : 0.0000 Mean : 0.00000 Mean : 0.0000 Mean : 0.00000
## 3rd Qu.: 0.6182 3rd Qu.: 0.66251 3rd Qu.: 0.4931 3rd Qu.: 0.64882
## Max. : 7.8484 Max. : 7.12688 Max. : 10.5268 Max. : 8.87774
## V16 V17 V18
## Min. :-14.12985 Min. :-25.16280 Min. :-9.498746
## 1st Qu.: -0.46804 1st Qu.: -0.48375 1st Qu.:-0.498850
## Median : 0.06641 Median : -0.06568 Median :-0.003636
## Mean : 0.00000 Mean : 0.00000 Mean : 0.000000
## 3rd Qu.: 0.52330 3rd Qu.: 0.39968 3rd Qu.: 0.500807
## Max. : 17.31511 Max. : 9.25353 Max. : 5.041069
## V19 V20 V21
## Min. :-7.213527 Min. :-54.49772 Min. :-34.83038
## 1st Qu.:-0.456299 1st Qu.: -0.21172 1st Qu.: -0.22839
## Median : 0.003735 Median : -0.06248 Median : -0.02945
## Mean : 0.000000 Mean : 0.00000 Mean : 0.00000
## 3rd Qu.: 0.458949 3rd Qu.: 0.13304 3rd Qu.: 0.18638
## Max. : 5.591971 Max. : 39.42090 Max. : 27.20284
## V22 V23 V24
## Min. :-10.933144 Min. :-44.80774 Min. :-2.83663
## 1st Qu.: -0.542350 1st Qu.: -0.16185 1st Qu.:-0.35459
## Median : 0.006782 Median : -0.01119 Median : 0.04098
## Mean : 0.000000 Mean : 0.00000 Mean : 0.00000
## 3rd Qu.: 0.528554 3rd Qu.: 0.14764 3rd Qu.: 0.43953
## Max. : 10.503090 Max. : 22.52841 Max. : 4.58455
## V25 V26 V27
## Min. :-10.29540 Min. :-2.60455 Min. :-22.565679
## 1st Qu.: -0.31715 1st Qu.:-0.32698 1st Qu.: -0.070840
## Median : 0.01659 Median :-0.05214 Median : 0.001342
## Mean : 0.00000 Mean : 0.00000 Mean : 0.000000
## 3rd Qu.: 0.35072 3rd Qu.: 0.24095 3rd Qu.: 0.091045
## Max. : 7.51959 Max. : 3.51735 Max. : 31.612198
## V28 Amount Class
## Min. :-15.43008 Min. : 0.00 Min. :0.000000
## 1st Qu.: -0.05296 1st Qu.: 5.60 1st Qu.:0.000000
## Median : 0.01124 Median : 22.00 Median :0.000000
## Mean : 0.00000 Mean : 88.35 Mean :0.001728
## 3rd Qu.: 0.07828 3rd Qu.: 77.17 3rd Qu.:0.000000
## Max. : 33.84781 Max. :25691.16 Max. :1.000000
##Εξερεύνηση Δεδομένων
# Work on a subset of the columns of the raw data.
df <- select(data, Time, V1, V2, V3, V10, Amount, Class)

# Scatter plot of Amount against V1; points are semi-transparent.
ggplot(data = df, mapping = aes(x = V1, y = Amount)) +
  geom_point(alpha = 0.4) +
  labs(title = "Σχέση μεταξύ V1 και Ποσού")

# Scatter plot of Amount against V2; points are semi-transparent.
ggplot(data = df, mapping = aes(x = V2, y = Amount)) +
  geom_point(alpha = 0.4) +
  labs(title = "Σχέση μεταξύ V2 και Ποσού")

# Box plot of Amount for each value of Class (Class is treated as a factor).
ggplot(data = df, mapping = aes(x = as.factor(Class), y = Amount)) +
  geom_boxplot() +
  labs(title = "Boxplot Ποσού ανά Κατηγορία (Απάτη / Όχι)", x = "Class")
##Απλή Γραμμική Παλινδρόμηση
# Simple linear regression: Amount explained by V1 only.
model1 <- lm(formula = Amount ~ V1, data = df)
summary(model1)
##
## Call:
## lm(formula = Amount ~ V1, data = df)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1147.0 -90.1 -39.6 8.0 24569.1
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 88.3496 0.4564 193.6 <2e-16 ***
## V1 -29.0778 0.2330 -124.8 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 243.5 on 284805 degrees of freedom
## Multiple R-squared: 0.05185, Adjusted R-squared: 0.05185
## F-statistic: 1.558e+04 on 1 and 284805 DF, p-value: < 2.2e-16
# Sum of squared residuals (SSE) of model1; the value is printed afterwards.
SSE1 <- sum(model1[["residuals"]]^2)
SSE1
## [1] 16893624603
# Linear regression with two predictors: Amount explained by V1 and V2.
model2 <- lm(formula = Amount ~ V1 + V2, data = df)
summary(model2)
##
## Call:
## lm(formula = Amount ~ V1 + V2, data = df)
##
## Residuals:
## Min 1Q Median 3Q Max
## -6379.7 -52.9 -19.0 20.2 22005.4
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 88.3496 0.3824 231.0 <2e-16 ***
## V1 -29.0778 0.1952 -148.9 <2e-16 ***
## V2 -80.4914 0.2316 -347.6 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 204.1 on 284804 degrees of freedom
## Multiple R-squared: 0.3342, Adjusted R-squared: 0.3342
## F-statistic: 7.149e+04 on 2 and 284804 DF, p-value: < 2.2e-16
# Sum of squared residuals (SSE) of model2; the value is printed afterwards.
SSE2 <- sum(model2[["residuals"]]^2)
SSE2
## [1] 11862048274
# Linear regression with three predictors: Amount explained by V1, V2 and V3.
model3 <- lm(formula = Amount ~ V1 + V2 + V3, data = df)
summary(model3)
##
## Call:
## lm(formula = Amount ~ V1 + V2 + V3, data = df)
##
## Residuals:
## Min 1Q Median 3Q Max
## -6609.5 -60.8 -13.1 32.3 20324.4
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 88.3496 0.3694 239.2 <2e-16 ***
## V1 -29.0778 0.1886 -154.2 <2e-16 ***
## V2 -80.4914 0.2237 -359.8 <2e-16 ***
## V3 -34.7867 0.2436 -142.8 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 197.1 on 284803 degrees of freedom
## Multiple R-squared: 0.3787, Adjusted R-squared: 0.3787
## F-statistic: 5.787e+04 on 3 and 284803 DF, p-value: < 2.2e-16
# Sum of squared residuals (SSE) of model3; the value is printed afterwards.
SSE3 <- sum(model3[["residuals"]]^2)
SSE3
## [1] 11069694562
##Σύγκριση Μοντέλων
# One-row summary of model1 fit statistics (R-squared, sigma, AIC, BIC, ...).
glance(model1)
## # A tibble: 1 × 12
## r.squared adj.r.squared sigma statistic p.value df logLik AIC BIC
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 0.0519 0.0518 244. 15575. 0 1 -1969229. 3.94e6 3.94e6
## # ℹ 3 more variables: deviance <dbl>, df.residual <int>, nobs <int>
# One-row summary of model2 fit statistics (R-squared, sigma, AIC, BIC, ...).
glance(model2)
## # A tibble: 1 × 12
## r.squared adj.r.squared sigma statistic p.value df logLik AIC BIC
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 0.334 0.334 204. 71494. 0 2 -1918876. 3.84e6 3.84e6
## # ℹ 3 more variables: deviance <dbl>, df.residual <int>, nobs <int>
# One-row summary of model3 fit statistics (R-squared, sigma, AIC, BIC, ...).
glance(model3)
## # A tibble: 1 × 12
## r.squared adj.r.squared sigma statistic p.value df logLik AIC BIC
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 0.379 0.379 197. 57869. 0 3 -1909031. 3.82e6 3.82e6
## # ℹ 3 more variables: deviance <dbl>, df.residual <int>, nobs <int>
# Collect R-squared and SSE of the three fitted models in one table,
# one row per model.
df_models <- tibble(
  Model = c("Model1", "Model2", "Model3"),
  R2 = vapply(
    list(model1, model2, model3),
    function(fit) summary(fit)$r.squared,
    numeric(1)
  ),
  SSE = c(SSE1, SSE2, SSE3)
)

# R2 lies in [0, 1] while SSE is of the order of 1e10, so on a shared y axis
# the R2 bars would be invisible. Each metric therefore gets its own panel
# with an independent y scale; the panel strips name the metric, which makes
# a fill legend redundant.
df_models %>%
  pivot_longer(-Model) %>%
  ggplot(aes(x = Model, y = value, fill = name)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~name, scales = "free_y") +
  labs(title = "Σύγκριση R-squared και SSE ανά Μοντέλο", y = "")
##Συμπεράσματα
Η V1 από μόνη της έχει περιορισμένη εξηγητική δύναμη για το
Amount (R² ≈ 0.05), ενώ η προσθήκη των V2 και V3 βελτιώνει σημαντικά
το μοντέλο (R² ≈ 0.33 και 0.38 αντίστοιχα, με αντίστοιχη μείωση του SSE).