##Εισαγωγή Δεδομένων

# Attach the packages the rest of this script calls into:
#   dplyr   - glimpse(), select(), %>%
#   tidyr   - pivot_longer()
#   tibble  - tibble()
#   ggplot2 - ggplot() and its layers
#   broom   - glance()
# Without these the first call to glimpse() fails in a fresh R session.
library(dplyr)
library(tidyr)
library(tibble)
library(ggplot2)
library(broom)

# Read the CSV from the current working directory and show a compact
# column-by-column overview (name, type, first values).
data <- read.csv("creditcard.csv")
glimpse(data)
## Rows: 284,807
## Columns: 31
## $ Time   <dbl> 0, 0, 1, 1, 2, 2, 4, 7, 7, 9, 10, 10, 10, 11, 12, 12, 12, 13, 1…
## $ V1     <dbl> -1.3598071, 1.1918571, -1.3583541, -0.9662717, -1.1582331, -0.4…
## $ V2     <dbl> -0.07278117, 0.26615071, -1.34016307, -0.18522601, 0.87773675, …
## $ V3     <dbl> 2.53634674, 0.16648011, 1.77320934, 1.79299334, 1.54871785, 1.1…
## $ V4     <dbl> 1.37815522, 0.44815408, 0.37977959, -0.86329128, 0.40303393, -0…
## $ V5     <dbl> -0.33832077, 0.06001765, -0.50319813, -0.01030888, -0.40719338,…
## $ V6     <dbl> 0.46238778, -0.08236081, 1.80049938, 1.24720317, 0.09592146, -0…
## $ V7     <dbl> 0.239598554, -0.078802983, 0.791460956, 0.237608940, 0.59294074…
## $ V8     <dbl> 0.098697901, 0.085101655, 0.247675787, 0.377435875, -0.27053267…
## $ V9     <dbl> 0.3637870, -0.2554251, -1.5146543, -1.3870241, 0.8177393, -0.56…
## $ V10    <dbl> 0.09079417, -0.16697441, 0.20764287, -0.05495192, 0.75307443, -…
## $ V11    <dbl> -0.55159953, 1.61272666, 0.62450146, -0.22648726, -0.82284288, …
## $ V12    <dbl> -0.61780086, 1.06523531, 0.06608369, 0.17822823, 0.53819555, 0.…
## $ V13    <dbl> -0.99138985, 0.48909502, 0.71729273, 0.50775687, 1.34585159, -0…
## $ V14    <dbl> -0.31116935, -0.14377230, -0.16594592, -0.28792375, -1.11966983…
## $ V15    <dbl> 1.468176972, 0.635558093, 2.345864949, -0.631418118, 0.17512113…
## $ V16    <dbl> -0.47040053, 0.46391704, -2.89008319, -1.05964725, -0.45144918,…
## $ V17    <dbl> 0.207971242, -0.114804663, 1.109969379, -0.684092786, -0.237033…
## $ V18    <dbl> 0.02579058, -0.18336127, -0.12135931, 1.96577500, -0.03819479, …
## $ V19    <dbl> 0.40399296, -0.14578304, -2.26185710, -1.23262197, 0.80348692, …
## $ V20    <dbl> 0.25141210, -0.06908314, 0.52497973, -0.20803778, 0.40854236, 0…
## $ V21    <dbl> -0.018306778, -0.225775248, 0.247998153, -0.108300452, -0.00943…
## $ V22    <dbl> 0.277837576, -0.638671953, 0.771679402, 0.005273597, 0.79827849…
## $ V23    <dbl> -0.110473910, 0.101288021, 0.909412262, -0.190320519, -0.137458…
## $ V24    <dbl> 0.06692807, -0.33984648, -0.68928096, -1.17557533, 0.14126698, …
## $ V25    <dbl> 0.12853936, 0.16717040, -0.32764183, 0.64737603, -0.20600959, -…
## $ V26    <dbl> -0.18911484, 0.12589453, -0.13909657, -0.22192884, 0.50229222, …
## $ V27    <dbl> 0.133558377, -0.008983099, -0.055352794, 0.062722849, 0.2194222…
## $ V28    <dbl> -0.021053053, 0.014724169, -0.059751841, 0.061457629, 0.2151531…
## $ Amount <dbl> 149.62, 2.69, 378.66, 123.50, 69.99, 3.67, 4.99, 40.80, 93.20, …
## $ Class  <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
# Per-column descriptive statistics (min, quartiles, mean, max) of the raw data.
data %>% summary()
##       Time              V1                  V2                  V3          
##  Min.   :     0   Min.   :-56.40751   Min.   :-72.71573   Min.   :-48.3256  
##  1st Qu.: 54202   1st Qu.: -0.92037   1st Qu.: -0.59855   1st Qu.: -0.8904  
##  Median : 84692   Median :  0.01811   Median :  0.06549   Median :  0.1799  
##  Mean   : 94814   Mean   :  0.00000   Mean   :  0.00000   Mean   :  0.0000  
##  3rd Qu.:139321   3rd Qu.:  1.31564   3rd Qu.:  0.80372   3rd Qu.:  1.0272  
##  Max.   :172792   Max.   :  2.45493   Max.   : 22.05773   Max.   :  9.3826  
##        V4                 V5                   V6                 V7          
##  Min.   :-5.68317   Min.   :-113.74331   Min.   :-26.1605   Min.   :-43.5572  
##  1st Qu.:-0.84864   1st Qu.:  -0.69160   1st Qu.: -0.7683   1st Qu.: -0.5541  
##  Median :-0.01985   Median :  -0.05434   Median : -0.2742   Median :  0.0401  
##  Mean   : 0.00000   Mean   :   0.00000   Mean   :  0.0000   Mean   :  0.0000  
##  3rd Qu.: 0.74334   3rd Qu.:   0.61193   3rd Qu.:  0.3986   3rd Qu.:  0.5704  
##  Max.   :16.87534   Max.   :  34.80167   Max.   : 73.3016   Max.   :120.5895  
##        V8                  V9                 V10                 V11          
##  Min.   :-73.21672   Min.   :-13.43407   Min.   :-24.58826   Min.   :-4.79747  
##  1st Qu.: -0.20863   1st Qu.: -0.64310   1st Qu.: -0.53543   1st Qu.:-0.76249  
##  Median :  0.02236   Median : -0.05143   Median : -0.09292   Median :-0.03276  
##  Mean   :  0.00000   Mean   :  0.00000   Mean   :  0.00000   Mean   : 0.00000  
##  3rd Qu.:  0.32735   3rd Qu.:  0.59714   3rd Qu.:  0.45392   3rd Qu.: 0.73959  
##  Max.   : 20.00721   Max.   : 15.59500   Max.   : 23.74514   Max.   :12.01891  
##       V12                V13                V14                V15          
##  Min.   :-18.6837   Min.   :-5.79188   Min.   :-19.2143   Min.   :-4.49894  
##  1st Qu.: -0.4056   1st Qu.:-0.64854   1st Qu.: -0.4256   1st Qu.:-0.58288  
##  Median :  0.1400   Median :-0.01357   Median :  0.0506   Median : 0.04807  
##  Mean   :  0.0000   Mean   : 0.00000   Mean   :  0.0000   Mean   : 0.00000  
##  3rd Qu.:  0.6182   3rd Qu.: 0.66251   3rd Qu.:  0.4931   3rd Qu.: 0.64882  
##  Max.   :  7.8484   Max.   : 7.12688   Max.   : 10.5268   Max.   : 8.87774  
##       V16                 V17                 V18           
##  Min.   :-14.12985   Min.   :-25.16280   Min.   :-9.498746  
##  1st Qu.: -0.46804   1st Qu.: -0.48375   1st Qu.:-0.498850  
##  Median :  0.06641   Median : -0.06568   Median :-0.003636  
##  Mean   :  0.00000   Mean   :  0.00000   Mean   : 0.000000  
##  3rd Qu.:  0.52330   3rd Qu.:  0.39968   3rd Qu.: 0.500807  
##  Max.   : 17.31511   Max.   :  9.25353   Max.   : 5.041069  
##       V19                 V20                 V21           
##  Min.   :-7.213527   Min.   :-54.49772   Min.   :-34.83038  
##  1st Qu.:-0.456299   1st Qu.: -0.21172   1st Qu.: -0.22839  
##  Median : 0.003735   Median : -0.06248   Median : -0.02945  
##  Mean   : 0.000000   Mean   :  0.00000   Mean   :  0.00000  
##  3rd Qu.: 0.458949   3rd Qu.:  0.13304   3rd Qu.:  0.18638  
##  Max.   : 5.591971   Max.   : 39.42090   Max.   : 27.20284  
##       V22                  V23                 V24          
##  Min.   :-10.933144   Min.   :-44.80774   Min.   :-2.83663  
##  1st Qu.: -0.542350   1st Qu.: -0.16185   1st Qu.:-0.35459  
##  Median :  0.006782   Median : -0.01119   Median : 0.04098  
##  Mean   :  0.000000   Mean   :  0.00000   Mean   : 0.00000  
##  3rd Qu.:  0.528554   3rd Qu.:  0.14764   3rd Qu.: 0.43953  
##  Max.   : 10.503090   Max.   : 22.52841   Max.   : 4.58455  
##       V25                 V26                V27            
##  Min.   :-10.29540   Min.   :-2.60455   Min.   :-22.565679  
##  1st Qu.: -0.31715   1st Qu.:-0.32698   1st Qu.: -0.070840  
##  Median :  0.01659   Median :-0.05214   Median :  0.001342  
##  Mean   :  0.00000   Mean   : 0.00000   Mean   :  0.000000  
##  3rd Qu.:  0.35072   3rd Qu.: 0.24095   3rd Qu.:  0.091045  
##  Max.   :  7.51959   Max.   : 3.51735   Max.   : 31.612198  
##       V28                Amount             Class         
##  Min.   :-15.43008   Min.   :    0.00   Min.   :0.000000  
##  1st Qu.: -0.05296   1st Qu.:    5.60   1st Qu.:0.000000  
##  Median :  0.01124   Median :   22.00   Median :0.000000  
##  Mean   :  0.00000   Mean   :   88.35   Mean   :0.001728  
##  3rd Qu.:  0.07828   3rd Qu.:   77.17   3rd Qu.:0.000000  
##  Max.   : 33.84781   Max.   :25691.16   Max.   :1.000000

##Εξερεύνηση Δεδομένων

# Working subset: seven of the columns of `data`, all rows.
df <- data[, c("Time", "V1", "V2", "V3", "V10", "Amount", "Class")]

# Amount against V1; alpha = 0.4 makes the points semi-transparent so dense
# regions remain readable.
ggplot(data = df, mapping = aes(V1, Amount)) +
  geom_point(alpha = 0.4) +
  ggtitle("Σχέση μεταξύ V1 και Ποσού")

# Same scatter plot with V2 on the x axis.
ggplot(data = df, mapping = aes(V2, Amount)) +
  geom_point(alpha = 0.4) +
  ggtitle("Σχέση μεταξύ V2 και Ποσού")

# Distribution of Amount per Class value. Class is converted to a factor so
# that each value gets its own box; the title labels the groups as
# fraud / not fraud.
ggplot(data = df, mapping = aes(x = as.factor(Class), y = Amount)) +
  geom_boxplot() +
  ggtitle("Boxplot Ποσού ανά Κατηγορία (Απάτη / Όχι)") +
  xlab("Class")

##Απλή Γραμμική Παλινδρόμηση

# Simple linear regression: Amount explained by the single predictor V1.
model1 <- lm(formula = Amount ~ V1, data = df)
# Coefficient table, residual quantiles, R-squared and F-test for the fit.
summary(object = model1)
## 
## Call:
## lm(formula = Amount ~ V1, data = df)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -1147.0   -90.1   -39.6     8.0 24569.1 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  88.3496     0.4564   193.6   <2e-16 ***
## V1          -29.0778     0.2330  -124.8   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 243.5 on 284805 degrees of freedom
## Multiple R-squared:  0.05185,    Adjusted R-squared:  0.05185 
## F-statistic: 1.558e+04 on 1 and 284805 DF,  p-value: < 2.2e-16
# Sum of squared residuals of model1. The surrounding parentheses make the
# assignment visible so the value is also printed.
(SSE1 <- sum(residuals(model1)^2))
## [1] 16893624603

##Προσθήκη Μεταβλητών στο Μοντέλο

# Multiple linear regression: Amount explained by V1 and V2.
model2 <- lm(formula = Amount ~ V1 + V2, data = df)
# Coefficient table, residual quantiles, R-squared and F-test for the fit.
summary(object = model2)
## 
## Call:
## lm(formula = Amount ~ V1 + V2, data = df)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -6379.7   -52.9   -19.0    20.2 22005.4 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  88.3496     0.3824   231.0   <2e-16 ***
## V1          -29.0778     0.1952  -148.9   <2e-16 ***
## V2          -80.4914     0.2316  -347.6   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 204.1 on 284804 degrees of freedom
## Multiple R-squared:  0.3342, Adjusted R-squared:  0.3342 
## F-statistic: 7.149e+04 on 2 and 284804 DF,  p-value: < 2.2e-16
# Sum of squared residuals of model2. The surrounding parentheses make the
# assignment visible so the value is also printed.
(SSE2 <- sum(residuals(model2)^2))
## [1] 11862048274
# Multiple linear regression: Amount explained by V1, V2 and V3.
model3 <- lm(formula = Amount ~ V1 + V2 + V3, data = df)
# Coefficient table, residual quantiles, R-squared and F-test for the fit.
summary(object = model3)
## 
## Call:
## lm(formula = Amount ~ V1 + V2 + V3, data = df)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -6609.5   -60.8   -13.1    32.3 20324.4 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  88.3496     0.3694   239.2   <2e-16 ***
## V1          -29.0778     0.1886  -154.2   <2e-16 ***
## V2          -80.4914     0.2237  -359.8   <2e-16 ***
## V3          -34.7867     0.2436  -142.8   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 197.1 on 284803 degrees of freedom
## Multiple R-squared:  0.3787, Adjusted R-squared:  0.3787 
## F-statistic: 5.787e+04 on 3 and 284803 DF,  p-value: < 2.2e-16
# Sum of squared residuals of model3. The surrounding parentheses make the
# assignment visible so the value is also printed.
(SSE3 <- sum(residuals(model3)^2))
## [1] 11069694562

##Σύγκριση Μοντέλων

# One-row tibble of fit statistics for model1 (R-squared, sigma, AIC, BIC, ...).
broom::glance(model1)
## # A tibble: 1 × 12
##   r.squared adj.r.squared sigma statistic p.value    df    logLik     AIC    BIC
##       <dbl>         <dbl> <dbl>     <dbl>   <dbl> <dbl>     <dbl>   <dbl>  <dbl>
## 1    0.0519        0.0518  244.    15575.       0     1 -1969229.  3.94e6 3.94e6
## # ℹ 3 more variables: deviance <dbl>, df.residual <int>, nobs <int>
# One-row tibble of fit statistics for model2 (R-squared, sigma, AIC, BIC, ...).
broom::glance(model2)
## # A tibble: 1 × 12
##   r.squared adj.r.squared sigma statistic p.value    df    logLik     AIC    BIC
##       <dbl>         <dbl> <dbl>     <dbl>   <dbl> <dbl>     <dbl>   <dbl>  <dbl>
## 1     0.334         0.334  204.    71494.       0     2 -1918876.  3.84e6 3.84e6
## # ℹ 3 more variables: deviance <dbl>, df.residual <int>, nobs <int>
# One-row tibble of fit statistics for model3 (R-squared, sigma, AIC, BIC, ...).
broom::glance(model3)
## # A tibble: 1 × 12
##   r.squared adj.r.squared sigma statistic p.value    df    logLik     AIC    BIC
##       <dbl>         <dbl> <dbl>     <dbl>   <dbl> <dbl>     <dbl>   <dbl>  <dbl>
## 1     0.379         0.379  197.    57869.       0     3 -1909031.  3.82e6 3.82e6
## # ℹ 3 more variables: deviance <dbl>, df.residual <int>, nobs <int>
# Comparison table: one row per fitted model with its R-squared and its sum of
# squared residuals (SSE1..SSE3 computed earlier in the script).
df_models <- tibble(
  Model = c("Model1", "Model2", "Model3"),
  R2 = vapply(
    list(model1, model2, model3),
    function(fit) summary(fit)$r.squared,
    numeric(1)
  ),
  SSE = c(SSE1, SSE2, SSE3)
)

# Bar chart of both metrics per model.
# R-squared lies between 0 and 1 while SSE is on the order of 1e10, so drawing
# them against one shared y axis squashes the R-squared bars to nothing. Each
# metric therefore gets its own panel with an independent y scale; the panel
# strip names the metric, which makes the fill legend redundant.
df_models %>%
  pivot_longer(-Model) %>%
  ggplot(aes(x = Model, y = value, fill = name)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~ name, scales = "free_y") +
  labs(title = "Σύγκριση R-squared και SSE ανά Μοντέλο", y = "")

##Συμπεράσματα