TASK 1

mydata4 <- read.csv("~/Bootcamp/Russia losses levels csv4.csv", header=TRUE, dec=".", sep= ";", stringsAsFactors=FALSE)

mydata4 <- mydata4[c(-9,-10,-13,-14,-15,-17,-18)]

This data shows Russian equipment losses in the Russia-Ukraine war of 2022; each row represents one day of the war. We do not keep all the columns, as some are mostly empty. Of those retained, the first two identify the day and date, the last is a character column naming the direction in which Russia lost the most equipment that day, and the rest are numeric cumulative counts of destroyed equipment. For the sake of computing the mode, the direction names (cities) were recoded as numbers.

mydata4$daily_aircraft <- diff(c(0, mydata4$aircraft))
mydata4$daily_helicopter <- diff(c(0, mydata4$helicopter))
mydata4$daily_tank <- diff(c(0, mydata4$tank))
mydata4$daily_APC <- diff(c(0, mydata4$APC))
mydata4$daily_field.artillery <- diff(c(0, mydata4$field.artillery))
mydata4$daily_MRL <- diff(c(0, mydata4$MRL))
mydata4$daily_drone <- diff(c(0, mydata4$drone))

mydata4 <- mydata4[c(-3,-4,-5,-6,-7,-8,-9,-10)]

In this segment the data is transformed from cumulative totals to daily losses by differencing each column, with a leading zero so that the first day's total counts as that day's loss.
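
The same transformation can be written more compactly, as in this sketch (cum_cols and daily are illustrative names introduced here; it assumes the rows are ordered by date):

cum_cols <- c("aircraft", "helicopter", "tank", "APC",
              "field.artillery", "MRL", "drone")
# Difference every cumulative column at once, prepending 0 so that
# day 1 keeps its full count
daily <- lapply(mydata4[cum_cols], function(x) diff(c(0, x)))
names(daily) <- paste0("daily_", cum_cols)
mydata4 <- cbind(mydata4, daily)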

##install.packages("tidyverse")
library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.2 ──
## ✔ ggplot2 3.3.6      ✔ purrr   0.3.4 
## ✔ tibble  3.1.8      ✔ dplyr   1.0.10
## ✔ tidyr   1.2.1      ✔ stringr 1.4.1 
## ✔ readr   2.1.2      ✔ forcats 0.5.2 
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
mydata4 <- mydata4 %>% relocate(greatest.losses.direction, .after = daily_drone)

colnames(mydata4)[colnames(mydata4) == "daily_APC"] <- "daily_Armored personnel carriers"

mydata4 <- na.omit(mydata4)

Here a column is relocated, a column is renamed, and rows containing NA values are removed.

summary(mydata4[,c(-1,-2,-10)])
##  daily_aircraft   daily_helicopter   daily_tank   
##  Min.   :0.0000   Min.   :0.0000   Min.   : 0.00  
##  1st Qu.:0.0000   1st Qu.:0.0000   1st Qu.: 4.00  
##  Median :0.0000   Median :0.0000   Median : 8.00  
##  Mean   :0.5325   Mean   :0.4481   Mean   : 9.26  
##  3rd Qu.:1.0000   3rd Qu.:1.0000   3rd Qu.:13.75  
##  Max.   :9.0000   Max.   :4.0000   Max.   :35.00  
##  daily_Armored personnel carriers daily_field.artillery   daily_MRL     
##  Min.   : 0.00                    Min.   : 0.000        Min.   : 0.000  
##  1st Qu.: 8.00                    1st Qu.: 3.000        1st Qu.: 0.000  
##  Median :14.00                    Median : 5.000        Median : 0.000  
##  Mean   :17.02                    Mean   : 6.247        Mean   : 1.195  
##  3rd Qu.:25.00                    3rd Qu.: 8.750        3rd Qu.: 2.000  
##  Max.   :50.00                    Max.   :32.000        Max.   :10.000  
##   daily_drone    
##  Min.   : 0.000  
##  1st Qu.: 2.000  
##  Median : 4.000  
##  Mean   : 5.104  
##  3rd Qu.: 7.000  
##  Max.   :26.000

From the summary we can see that the average number of armored personnel carriers (APCs) destroyed daily is 17.02.

The median for tanks destroyed daily is 8, which means that on half of the days fewer than 8 tanks were destroyed and on the other half more. The mean, 9.26, is somewhat higher, pulled up by a few heavy-loss days.

# Returns the most frequent value(s) in a vector
my_mode <- function(x) {
  unique_x <- unique(x)
  tabulate_x <- tabulate(match(x, unique_x))
  unique_x[tabulate_x == max(tabulate_x)]
}

my_mode(mydata4$greatest.losses.direction) 
## [1] 39

From the mode function we can see that the mode is 39, which translates to Donetsk. In other words, the most common greatest-losses direction was Donetsk: on more days than any other, Russia lost the most equipment in the Donetsk direction.

mydata4$greatest.losses.directionLEVEL <- factor(
  mydata4$greatest.losses.direction,
  levels = 1:48,
  labels = c("Sievierodonetsk", "Kurakhove and Izyum", "Zaporizhzhia and Izyum",
             "Izyum", "Izyum; Novopavlivsk", "Popasna", "Lyman and Kurakhove",
             "Slobozhanskyi", "Novopavlivsk", "Avdiivka",
             "Novopavlivsk; Kurakhove and Sievierodonetsk", "Kurakhove",
             "Kurakhove and Avdiivka", "Bakhmut and Zaporizhzhia",
             "Lyman and Zaporizhzhia", "Sloviansk; Kryvyi Rih and Zaporizhzhia",
             "Bakhmut", "Lyman", "Avdiivka and Kryvyi Rih", "Zaporizhzhia",
             "Kryvyi Rih and Zaporizhzhia", "Kryvyi Rih and Bakhmut", "Sloviansk",
             "Kharkiv and Bakhmut", "Sievierodonetsk and Bakhmut",
             "Bakhmut and Sievierodonetsk", "Sloviansk; Bakhmut and Kryvyi Rih",
             "Bakhmut and Avdiivka", "Sloviansk; Bakhmut and Avdiivka",
             "Sloviansk and Bakhmut", "Bakhmut and Kurakhove",
             "Kramatorsk and Bakhmut", "Sloviansk and Donetsk",
             "Avdiivka and Bakhmut", "Kramatorsk", "Mykolaiv",
             "Kramatorsk; Kryvyi Rih and Bakhmut", "Kryvyi Rih", "Donetsk",
             "Donetsk and Kryvyi Rih", "Bakhmut and Kryvyi Rih",
             "Bakhmut and Donetsk", "Kharkiv and Donetsk", "Donetsk and Mykolaiv",
             "Donetsk and Kurakhove", "Kryvyi Rih and Donetsk",
             "Kryvyi Rih and Mykolaiv", "Kramatorsk and Donetsk"))

After finding the mode, we convert the numeric codes back to the direction names (as a labelled factor) for better readability.
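
With the labelled factor in place, the mode could also be read straight from a frequency table, without recoding cities to numbers first (a sketch; tab is an illustrative name):

# Most frequent direction straight from a frequency table
tab <- table(mydata4$greatest.losses.directionLEVEL)
names(tab)[which.max(tab)]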

mydata5 <- mydata4
##install.packages("car")
library(car)
## Loading required package: carData
## 
## Attaching package: 'car'
## The following object is masked from 'package:dplyr':
## 
##     recode
## The following object is masked from 'package:purrr':
## 
##     some
scatterplot(mydata5$daily_tank, mydata5$`daily_Armored personnel carriers`,
            xlab = "Daily tank losses",
            ylab = "Daily armored personnel carrier losses",
            smooth = FALSE)

From the scatter plot we can see that tank and APC losses are correlated, which suggests that the Russian army deploys units consisting of both. We cannot say which gets destroyed more often, as we only have the numbers lost, not the numbers deployed.
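
To put a number on the visual impression, the Pearson correlation between the two daily series could be computed directly (a quick sketch; it was not run in the original, so no value is shown):

# Pearson correlation between daily tank and APC losses
cor(mydata5$daily_tank, mydata5$`daily_Armored personnel carriers`)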

TASK 2

mydata6 <- read.table("~/Bootcamp/Body mass.csv", 
                     header=TRUE, 
                     sep=";", 
                     dec=",")
summary(mydata6[,2])
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   49.70   60.23   62.80   62.88   64.50   83.20
sd(mydata6[,2])
## [1] 6.011403
hist(mydata6$Mass,
     
     main="Distribution of the body mass",
     xlab = "Body mass(kg)",
     ylab = "Frequency",
     breaks= seq(from=40, to=90, by=1))

H0 : μ = 59.5 kg (the average body mass is unchanged from 2018/2019)

t.test(mydata6$Mass,
       mu=59.5,
       alternative = "two.sided")
## 
##  One Sample t-test
## 
## data:  mydata6$Mass
## t = 3.9711, df = 49, p-value = 0.000234
## alternative hypothesis: true mean is not equal to 59.5
## 95 percent confidence interval:
##  61.16758 64.58442
## sample estimates:
## mean of x 
##    62.876

The p-value is very low: if the average body mass were still 59.5 kg as in 2018/2019, there would be only about a 0.02 % chance of obtaining a sample mean this far from it. We therefore reject H0 and conclude that the 2021/2022 average body mass differs.

# Effect size r from the t statistic and its degrees of freedom (df = 49)
t <- 3.9711
r <- sqrt((t^2) / (t^2 + 49))
r
## [1] 0.4934295

r is 0.49, which by Cohen's guidelines (0.1 small, 0.3 medium, 0.5 large) classifies as a medium, bordering on large, effect.
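
For reuse, the classification could be wrapped in a small helper (illustrative only; effect_size_label is a name introduced here, with thresholds following Cohen's conventions):

# Classify |r| by Cohen's rough thresholds: 0.1 small, 0.3 medium, 0.5 large
effect_size_label <- function(r) {
  cut(abs(r), breaks = c(0, 0.1, 0.3, 0.5, 1),
      labels = c("negligible", "small", "medium", "large"),
      include.lowest = TRUE)
}
effect_size_label(0.49)  # "medium"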

TASK 3

Import the dataset Apartments.xlsx

library(readxl)
mydata7 <- read_xlsx("~/Bootcamp/Apartments.xlsx")

Description: each row is one apartment; the variables are Age, Distance, Price, and the binary indicators Parking and Balcony (0 = no, 1 = yes).

Change categorical variables into factors

mydata7$ParkingFactor <- factor(mydata7$Parking, 
                              levels = c(0, 1), 
                              labels = c("No", "Yes"))

mydata7$BalconyFactor <- factor(mydata7$Balcony, 
                              levels = c(0, 1), 
                              labels = c("No", "Yes"))

Test the hypothesis H0: μ_Price = 1900 EUR. What can you conclude?

t.test(mydata7$Price,
       mu = 1900,
       alternative = "two.sided")
## 
##  One Sample t-test
## 
## data:  mydata7$Price
## t = 2.9022, df = 84, p-value = 0.004731
## alternative hypothesis: true mean is not equal to 1900
## 95 percent confidence interval:
##  1937.443 2100.440
## sample estimates:
## mean of x 
##  2018.941

Since the p-value (0.004731) is below 0.05, we reject H0: the mean price differs from 1900 EUR. The sample mean (2018.94) and the 95 % confidence interval (1937.4 to 2100.4) indicate it is higher.

Estimate the simple regression function: Price = f(Age). Save the results in object fit1 and explain the estimated regression coefficient, the coefficient of correlation and the coefficient of determination.

fit1 <- lm(formula = Price ~ Age, data = mydata7)
summary(fit1)
## 
## Call:
## lm(formula = Price ~ Age, data = mydata7)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -623.9 -278.0  -69.8  243.5  776.1 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 2185.455     87.043  25.108   <2e-16 ***
## Age           -8.975      4.164  -2.156    0.034 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 369.9 on 83 degrees of freedom
## Multiple R-squared:  0.05302,    Adjusted R-squared:  0.04161 
## F-statistic: 4.647 on 1 and 83 DF,  p-value: 0.03401
cor(mydata7$Price, mydata7$Age)
## [1] -0.230255

The estimated regression coefficient for Age is -8.975: on average, the price per m² decreases by about 8.98 EUR for each additional year of age, everything else unchanged.

The coefficient of correlation is -0.230255, indicating a weak negative linear association between Price and Age.

The coefficient of determination (multiple R-squared) is 0.05302, so Age explains only about 5.3 % of the variation in Price.
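
As a consistency check: in simple regression, the coefficient of determination is the squared coefficient of correlation:

# Squared correlation reproduces the multiple R-squared
cor(mydata7$Price, mydata7$Age)^2  # (-0.230255)^2 = 0.05302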

Show the scatterplot matrix between Price, Age and Distance. Based on the matrix, determine if there is a potential problem with multicollinearity.

scatterplotMatrix(mydata7[ , c(3,1,2)], 
                  smooth = FALSE)

In the panel relating the two explanatory variables, Age and Distance, the points scatter widely around the line with no strong linear relationship, so we presume multicollinearity is not present.

Estimate the multiple regression function: Price = f(Age, Distance). Save it in object named fit2.

fit2 <- lm(formula = Price ~ Age + Distance, data = mydata7)
fit2
## 
## Call:
## lm(formula = Price ~ Age + Distance, data = mydata7)
## 
## Coefficients:
## (Intercept)          Age     Distance  
##    2460.101       -7.934      -20.667

Check the multicollinearity with VIF statistics. Explain the findings.

vif(fit2)
##      Age Distance 
## 1.001845 1.001845
mean(vif(fit2))
## [1] 1.001845

Since both VIFs are about 1, well below the usual cutoff of 5, we can say there is no multicollinearity.
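
The VIF can also be reproduced by hand, since VIF_j = 1 / (1 - R_j²), where R_j² comes from regressing one explanatory variable on the other(s) (a sketch; r2_age is an illustrative name):

# VIF for Age by hand: should match vif(fit2)["Age"], about 1.0018
r2_age <- summary(lm(Age ~ Distance, data = mydata7))$r.squared
1 / (1 - r2_age)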

Calculate standardized residuals and Cook's distances for model fit2. Remove any potentially problematic case (outlier or unit with big influence).

mydata7$StdResid <- round(rstandard(fit2), 3) 
mydata7$CooksD <- round(cooks.distance(fit2), 3) 


hist(mydata7$StdResid, 
     xlab = "Standardized residuals", 
     ylab = "Frequency", 
     main = "Standardized residuals histogram",
     breaks = seq(from = -3, to = 3, by = 0.5)
     )

shapiro.test(mydata7$StdResid)
## 
##  Shapiro-Wilk normality test
## 
## data:  mydata7$StdResid
## W = 0.95303, p-value = 0.003645

With p = 0.003645 (< 0.05) we reject H0 of normally distributed standardized residuals.

head(mydata7[order(mydata7$StdResid),],6)
## # A tibble: 6 × 9
##     Age Distance Price Parking Balcony ParkingFactor BalconyFac…¹ StdRe…² CooksD
##   <dbl>    <dbl> <dbl>   <dbl>   <dbl> <fct>         <fct>          <dbl>  <dbl>
## 1     7        2  1760       0       1 No            Yes            -2.15  0.066
## 2    12       14  1650       0       1 No            Yes            -1.50  0.013
## 3    12       14  1650       0       0 No            No             -1.50  0.013
## 4    13        8  1800       0       0 No            No             -1.38  0.012
## 5    14       16  1660       0       1 No            Yes            -1.26  0.008
## 6    24        5  1830       1       0 Yes           No             -1.19  0.012
## # … with abbreviated variable names ¹​BalconyFactor, ²​StdResid

Based on the distribution of standardized residuals we exclude row 53.

head(mydata7[order(-mydata7$CooksD),],6)
## # A tibble: 6 × 9
##     Age Distance Price Parking Balcony ParkingFactor BalconyFac…¹ StdRe…² CooksD
##   <dbl>    <dbl> <dbl>   <dbl>   <dbl> <fct>         <fct>          <dbl>  <dbl>
## 1     5       45  2180       1       1 Yes           Yes             2.58  0.32 
## 2    43       37  1740       0       0 No            No              1.44  0.104
## 3     2       11  2790       1       0 Yes           No              2.05  0.069
## 4     7        2  1760       0       1 No            Yes            -2.15  0.066
## 5    37        3  2540       1       1 Yes           Yes             1.58  0.061
## 6    40        2  2400       0       1 No            Yes             1.09  0.038
## # … with abbreviated variable names ¹​BalconyFactor, ²​StdResid

Based on the distribution of Cook's distances we exclude row 38 (a threshold-based check follows below).
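
Both rows were picked by inspecting the sorted tables; a threshold-based check is a common complement (a sketch; 4/n is one conventional cutoff for Cook's distance):

# Flag cases with |standardized residual| > 3 or Cook's D above 4/n
n <- nrow(mydata7)
which(abs(mydata7$StdResid) > 3 | mydata7$CooksD > 4 / n)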

mydata7<-mydata7[c(-38,-53),]
hist(mydata7$StdResid,
     xlab = "Standardized residuals",
     ylab = "Frequency",
     main = "Standardized residuals histogram",
     breaks = seq(from = -3, to = 3, by = 0.5)
     )

hist(mydata7$CooksD,
     xlab = "Cooks distance",
     ylab = "Frequency",
     main = "Cooks distance histogram")

We can see that the problematic values are gone.

Estimate the fit2 again without potentially excluded cases and show the summary of the model. Explain all coefficients.

fit2 <- lm(Price ~ Age + Distance, data = mydata7)
summary(fit2)
## 
## Call:
## lm(formula = Price ~ Age + Distance, data = mydata7)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -420.51 -223.89  -62.78  202.78  575.08 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 2489.617     73.524  33.861  < 2e-16 ***
## Age           -7.350      3.103  -2.368   0.0203 *  
## Distance     -23.636      2.731  -8.654 4.21e-13 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 269.1 on 80 degrees of freedom
## Multiple R-squared:  0.513,  Adjusted R-squared:  0.5008 
## F-statistic: 42.13 on 2 and 80 DF,  p-value: 3.177e-13

Holding the other variable fixed, the price per m² falls by about 7.35 EUR for each additional year of age and by about 23.64 EUR for each additional unit of distance; the intercept (2489.6 EUR) is the estimated price of a new apartment at distance 0. Both slopes are statistically significant, and the model now explains about 51 % of the variation in Price.

Check for potential heteroskedasticity with a scatterplot between standardized residuals and standardized fitted values. Explain the findings.

# Recompute the standardized residuals from the refitted model
# (the values stored earlier came from the pre-exclusion fit)
mydata7$StdResid <- rstandard(fit2)
mydata7$StdfittedValues <- scale(fit2$fitted.values)

library(car)
scatterplot(y = mydata7$StdResid, x= mydata7$StdfittedValues,
            ylab = "Standarized residuals",
            xlab = "Standarized fitted values",
            boxplots = FALSE, 
            regLine = FALSE, 
            smooth = FALSE)

The standardized residuals show no systematic pattern against the fitted values (for example, no funnel shape), so there is no indication of heteroskedasticity.
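
The visual check could be complemented with a formal one, for example car's score test for non-constant error variance (a sketch; its output is not shown since it was not run in the original, but a small p-value would indicate heteroskedasticity):

# Breusch-Pagan-type score test for non-constant variance
ncvTest(fit2)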

Are standardized residuals distributed normally? Show the graph and formally test it. Explain the findings.

hist(mydata7$StdResid,
     xlab="Standardized residuals",
    ylab="Frequency",
    main="Standardized residuals histogram")

No standardized residuals exceed ±3 and the histogram looks roughly symmetric, but this alone does not establish normality; we test it formally below.

shapiro.test(mydata7$StdResid)
## 
##  Shapiro-Wilk normality test
## 
## data:  mydata7$StdResid
## W = 0.93368, p-value = 0.0003444

The p-value is very low, therefore we reject H0 and conclude that the standardized residuals are not normally distributed.
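
A Q-Q plot is another common way to judge this (a sketch using car; output not shown in the original):

# Studentized residuals vs. t quantiles; points leaving the confidence
# envelope would confirm the departure from normality
qqPlot(fit2)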

Estimate the linear regression function Price = f(Age, Distance, Parking and Balcony). Be careful to correctly include categorical variables. Save the object named fit3.

fit3 <- lm(formula =Price ~ Age + Distance + ParkingFactor + BalconyFactor,
           data = mydata7)
fit3
## 
## Call:
## lm(formula = Price ~ Age + Distance + ParkingFactor + BalconyFactor, 
##     data = mydata7)
## 
## Coefficients:
##      (Intercept)               Age          Distance  ParkingFactorYes  
##         2367.282            -6.605           -21.140           147.508  
## BalconyFactorYes  
##           -3.122

With the function anova(), check if model fit3 fits the data better than model fit2.

anova(fit2,fit3)
## Analysis of Variance Table
## 
## Model 1: Price ~ Age + Distance
## Model 2: Price ~ Age + Distance + ParkingFactor + BalconyFactor
##   Res.Df     RSS Df Sum of Sq      F  Pr(>F)  
## 1     80 5795128                              
## 2     78 5410469  2    384659 2.7727 0.06866 .
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Since p = 0.0687 > 0.05, adding Parking and Balcony does not significantly improve the fit over fit2 at the 5 % level.

Show the results of fit3 and explain the regression coefficients for both categorical variables. Can you write down the hypothesis being tested by the F-statistic shown at the bottom of the output?

summary(fit3)
## 
## Call:
## lm(formula = Price ~ Age + Distance + ParkingFactor + BalconyFactor, 
##     data = mydata7)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -420.58 -198.68  -44.44  229.33  529.90 
## 
## Coefficients:
##                  Estimate Std. Error t value Pr(>|t|)    
## (Intercept)      2367.282     93.781  25.243  < 2e-16 ***
## Age                -6.605      3.054  -2.162   0.0337 *  
## Distance          -21.140      2.878  -7.345 1.73e-10 ***
## ParkingFactorYes  147.508     62.799   2.349   0.0214 *  
## BalconyFactorYes   -3.122     58.635  -0.053   0.9577    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 263.4 on 78 degrees of freedom
## Multiple R-squared:  0.5453, Adjusted R-squared:  0.522 
## F-statistic: 23.39 on 4 and 78 DF,  p-value: 9.972e-13

Holding the other variables fixed, apartments with parking cost on average 147.5 EUR per m² more than comparable apartments without parking (p = 0.0214), while a balcony is associated with a negligible and insignificant difference of about -3.1 EUR (p = 0.9577).

F-statistic: H0: ρ² = 0 versus H1: ρ² > 0, i.e. the regressors jointly explain nothing versus something. Since the p-value is below 0.05, we reject H0: at least one regression coefficient differs from 0, so the model has explanatory power.

Save the fitted values and calculate the residual for apartment ID2.

# Fitted values and residuals from the final model fit3
# (the original mixed fitted values from fit2 with residuals from fit3)
mydata7$FittedValues <- fitted.values(fit3)
mydata7$Residuals <- residuals(fit3)

head(mydata7[, colnames(mydata7) %in% c("Apartment", "Residuals")])
## # A tibble: 6 × 1
##   Residuals
##       <dbl>
## 1     -86.0
## 2     425. 
## 3     -69.1
## 4     284. 
## 5    -372. 
## 6    -156.

The second row corresponds to apartment ID2 (only rows 38 and 53 were removed earlier), so its residual is about 425: the actual price is roughly 425 EUR per m² above the value fit3 predicts. Note that the data has no Apartment ID column, which is why the selection above returns only the Residuals column.
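
The same number can be recovered by hand (a quick sketch):

# Residual = actual price minus fitted value, for the second row (ID2)
mydata7$Price[2] - mydata7$FittedValues[2]  # about 425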