#PACKAGES AND LIBRARIES
#install.packages("psych")
library(psych)
#install.packages("pastecs")
library(pastecs)
#install.packages("tidyr")
library(tidyr)
## 
## Attaching package: 'tidyr'
## The following object is masked from 'package:pastecs':
## 
##     extract
#install.packages("car")
library(car)
## Loading required package: carData
## 
## Attaching package: 'car'
## The following object is masked from 'package:psych':
## 
##     logit
#install.packages("ggplot2")
library(ggplot2)
## 
## Attaching package: 'ggplot2'
## The following objects are masked from 'package:psych':
## 
##     %+%, alpha
#install.packages("readxl")
library(readxl)
#install.packages("Hmisc")
library(Hmisc)
## Loading required package: lattice
## Loading required package: survival
## Loading required package: Formula
## 
## Attaching package: 'Hmisc'
## The following object is masked from 'package:psych':
## 
##     describe
## The following objects are masked from 'package:base':
## 
##     format.pval, units

TASK 1 - OWN DATA SET

Importing of selected own data set.

# Load the car sales CSV: comma-separated fields, "." as decimal mark,
# first row holds the column names.
car_sales_path <- "~/Desktop/IMB/BOOTCAMP/BOOTCAMP R/Car_sales_original.csv"
owndata <- read.table(file = car_sales_path, sep = ",", dec = ".", header = TRUE)

Description and overview of the selected data set.

# Preview the first ten rows of the raw data.
head(owndata, n = 10)
##    Manufacturer   Model Sales_in_thousands X__year_resale_value Vehicle_type Price_in_thousands
## 1         Acura Integra             16.919               16.360    Passenger              21.50
## 2         Acura      TL             39.384               19.875    Passenger              28.40
## 3         Acura      CL             14.114               18.225    Passenger                 NA
## 4         Acura      RL              8.588               29.725    Passenger              42.00
## 5          Audi      A4             20.397               22.255    Passenger              23.99
## 6          Audi      A6             18.780               23.555    Passenger              33.95
## 7          Audi      A8              1.380               39.000    Passenger              62.00
## 8           BMW    323i             19.747                   NA    Passenger              26.99
## 9           BMW    328i              9.231               28.675    Passenger              33.40
## 10          BMW    528i             17.527               36.125    Passenger              38.90
##    Engine_size Horsepower Wheelbase Width Length Curb_weight Fuel_capacity Fuel_efficiency
## 1          1.8        140     101.2  67.3  172.4       2.639          13.2              28
## 2          3.2        225     108.1  70.3  192.9       3.517          17.2              25
## 3          3.2        225     106.9  70.6  192.0       3.470          17.2              26
## 4          3.5        210     114.6  71.4  196.6       3.850          18.0              22
## 5          1.8        150     102.6  68.2  178.0       2.998          16.4              27
## 6          2.8        200     108.7  76.1  192.0       3.561          18.5              22
## 7          4.2        310     113.0  74.0  198.2       3.902          23.7              21
## 8          2.5        170     107.3  68.4  176.0       3.179          16.6              26
## 9          2.8        193     107.3  68.5  176.0       3.197          16.6              24
## 10         2.8        193     111.4  70.9  188.0       3.472          18.5              25
##    Latest_Launch Power_perf_factor Country_of_origin
## 1       2/2/2012          58.28015                 1
## 2       6/3/2011          91.37078                 1
## 3       1/4/2012                NA                 1
## 4      3/10/2011          91.38978                 1
## 5      10/8/2011          62.77764                 1
## 6       8/9/2011          84.56511                 1
## 7      2/27/2012         134.65686                 1
## 8      6/28/2011          71.19121                 1
## 9      1/29/2012          81.87707                 1
## 10      4/4/2011          83.99872                 1

Description of variables:

  • Manufacturer: Manufacturer of vehicle

  • Model: Model of vehicle

  • Sales in thousands: Units of specific manufacturer and model of vehicle in ’000.

  • X year resale value: The resale value of a vehicle in year x in ’000.

  • Vehicle type: Type of vehicle is either a passenger vehicle or car. No need to differentiate for our research.

  • Price in thousands: Original price of vehicle for specific manufacturer and model of vehicle in ’000.

  • Engine size: Engine size/capacity in liters.

  • Horsepower: Horsepower of vehicle.

  • Wheelbase: Length of vehicle wheelbase in inches.

  • Width: Width of vehicle in inches.

  • Length: Length of vehicle in inches.

  • Curb weight: Weight of road ready vehicle in lbs.

  • Fuel capacity: Capacity of vehicles fuel tank in gallons.

  • Fuel efficiency: Fuel consumption of vehicles in miles per gallon.

  • Latest launch: Date of latest model variation being sold on market for the first time.

  • Power performance factor: Factor assigned to vehicle based on its performance, weight, fuel consumption, etc.

  • Country of origin: 1 = Non-US car manufacturer, 0 = US car manufacturer.

The selected data set is comprised of 157 observations. The observations were assessed on 17 different variables, of which 13 are numerical; the last of these (“Country_of_origin”) is a binary (dummy) variable (0 or 1) describing where the vehicle manufacturer originates. We are only going to be focusing on the sales volumes, original prices of the vehicles, the horsepower of the vehicles, fuel efficiency and the country of origin variables. The first step is therefore to remove all of the remaining “unnecessary” variables from our data set and create a new one.

Creating a new data set with only selected variables.

# Keep only the variables used in the analysis. Columns are selected by name
# rather than by position (previously 1, 2, 3, 6, 8, 14, 17) so the subset
# stays correct even if the column order of the CSV changes.
selected_vars <- c("Manufacturer", "Model", "Sales_in_thousands",
                   "Price_in_thousands", "Horsepower", "Fuel_efficiency",
                   "Country_of_origin")
owndata1 <- owndata[, selected_vars]
head(owndata1, 10)
##    Manufacturer   Model Sales_in_thousands Price_in_thousands Horsepower Fuel_efficiency
## 1         Acura Integra             16.919              21.50        140              28
## 2         Acura      TL             39.384              28.40        225              25
## 3         Acura      CL             14.114                 NA        225              26
## 4         Acura      RL              8.588              42.00        210              22
## 5          Audi      A4             20.397              23.99        150              27
## 6          Audi      A6             18.780              33.95        200              22
## 7          Audi      A8              1.380              62.00        310              21
## 8           BMW    323i             19.747              26.99        170              26
## 9           BMW    328i              9.231              33.40        193              24
## 10          BMW    528i             17.527              38.90        193              25
##    Country_of_origin
## 1                  1
## 2                  1
## 3                  1
## 4                  1
## 5                  1
## 6                  1
## 7                  1
## 8                  1
## 9                  1
## 10                 1

Removing all NA values from our dataset.

# Drop every row that has an NA in any of the selected columns.
owndata2 <- tidyr::drop_na(data = owndata1)
head(owndata2, n = 10)
##    Manufacturer   Model Sales_in_thousands Price_in_thousands Horsepower Fuel_efficiency
## 1         Acura Integra             16.919             21.500        140              28
## 2         Acura      TL             39.384             28.400        225              25
## 3         Acura      RL              8.588             42.000        210              22
## 4          Audi      A4             20.397             23.990        150              27
## 5          Audi      A6             18.780             33.950        200              22
## 6          Audi      A8              1.380             62.000        310              21
## 7           BMW    323i             19.747             26.990        170              26
## 8           BMW    328i              9.231             33.400        193              24
## 9           BMW    528i             17.527             38.900        193              25
## 10        Buick Century             91.561             21.975        175              25
##    Country_of_origin
## 1                  1
## 2                  1
## 3                  1
## 4                  1
## 5                  1
## 6                  1
## 7                  1
## 8                  1
## 9                  1
## 10                 0

From the new data set we can see that there were 4 observations that returned a NA value, which have now been omitted from our observations and analysis. From the head function of owndata2 we can see that row 3 has been removed.

Renaming columns for better readability and understanding.

# Replace the raw CSV headers with descriptive, human-readable labels
# (same order as the columns of owndata2).
readable_names <- c(
  "Manufacturer",
  "Model",
  "Sales in '000 units",
  "Original Price in '000 in USD",
  "Horsepower",
  "Fuel efficiency",
  "Country of origin"
)
names(owndata2) <- readable_names
head(owndata2, n = 10)
##    Manufacturer   Model Sales in '000 units Original Price in '000 in USD Horsepower
## 1         Acura Integra              16.919                        21.500        140
## 2         Acura      TL              39.384                        28.400        225
## 3         Acura      RL               8.588                        42.000        210
## 4          Audi      A4              20.397                        23.990        150
## 5          Audi      A6              18.780                        33.950        200
## 6          Audi      A8               1.380                        62.000        310
## 7           BMW    323i              19.747                        26.990        170
## 8           BMW    328i               9.231                        33.400        193
## 9           BMW    528i              17.527                        38.900        193
## 10        Buick Century              91.561                        21.975        175
##    Fuel efficiency Country of origin
## 1               28                 1
## 2               25                 1
## 3               22                 1
## 4               27                 1
## 5               22                 1
## 6               21                 1
## 7               26                 1
## 8               24                 1
## 9               25                 1
## 10              25                 0

Adding the country of origin factor to our data set.

# Turn the 0/1 origin code into a labelled factor: 0 = "US", 1 = "Non-US".
origin_codes <- owndata2[["Country of origin"]]
owndata2[["CountryF"]] <- factor(origin_codes,
                                 labels = c("US","Non-US"),
                                 levels = c(0, 1))
head(owndata2, n = 10)
##    Manufacturer   Model Sales in '000 units Original Price in '000 in USD Horsepower
## 1         Acura Integra              16.919                        21.500        140
## 2         Acura      TL              39.384                        28.400        225
## 3         Acura      RL               8.588                        42.000        210
## 4          Audi      A4              20.397                        23.990        150
## 5          Audi      A6              18.780                        33.950        200
## 6          Audi      A8               1.380                        62.000        310
## 7           BMW    323i              19.747                        26.990        170
## 8           BMW    328i               9.231                        33.400        193
## 9           BMW    528i              17.527                        38.900        193
## 10        Buick Century              91.561                        21.975        175
##    Fuel efficiency Country of origin CountryF
## 1               28                 1   Non-US
## 2               25                 1   Non-US
## 3               22                 1   Non-US
## 4               27                 1   Non-US
## 5               22                 1   Non-US
## 6               21                 1   Non-US
## 7               26                 1   Non-US
## 8               24                 1   Non-US
## 9               25                 1   Non-US
## 10              25                 0       US

Creating two separate statistical summaries and specific factors of data that combined give the most information about the selected data set.

# Five-number summary plus factor counts; Manufacturer (1), Model (2) and the
# numeric origin code (7) are left out.
summary(owndata2[, -c(1, 2, 7)])
##  Sales in '000 units Original Price in '000 in USD   Horsepower    Fuel efficiency   CountryF 
##  Min.   :  0.11      Min.   : 9.235                Min.   : 55.0   Min.   :15.00   US    :77  
##  1st Qu.: 13.80      1st Qu.:17.890                1st Qu.:148.0   1st Qu.:21.00   Non-US:76  
##  Median : 28.98      Median :22.799                Median :175.0   Median :24.00              
##  Mean   : 53.11      Mean   :27.444                Mean   :185.4   Mean   :23.83              
##  3rd Qu.: 67.96      3rd Qu.:31.965                3rd Qu.:215.0   3rd Qu.:26.00              
##  Max.   :540.56      Max.   :85.500                Max.   :450.0   Max.   :45.00
# Detailed descriptive statistics for the four numeric variables only
# (columns 1, 2, 7 and the factor in column 8 are excluded), to 3 decimals.
numeric_part <- owndata2[, -c(1, 2, 7, 8)]
round(stat.desc(numeric_part), digits = 3)
##              Sales in '000 units Original Price in '000 in USD Horsepower Fuel efficiency
## nbr.val                  153.000                       153.000    153.000         153.000
## nbr.null                   0.000                         0.000      0.000           0.000
## nbr.na                     0.000                         0.000      0.000           0.000
## min                        0.110                         9.235     55.000          15.000
## max                      540.561                        85.500    450.000          45.000
## range                    540.451                        76.265    395.000          30.000
## sum                     8126.522                      4198.912  28366.000        3646.000
## median                    28.976                        22.799    175.000          24.000
## mean                      53.115                        27.444    185.399          23.830
## SE.mean                    5.560                         1.167      4.617           0.347
## CI.mean.0.95              10.986                         2.306      9.121           0.686
## var                     4730.384                       208.451   3260.794          18.431
## std.dev                   68.778                        14.438     57.103           4.293
## coef.var                   1.295                         0.526      0.308           0.180

From the two statistical summaries we can see that there are 153 units of observation, of which 77 are US manufacturer vehicles and 76 are Non-US manufacturer vehicles. From the above tables we can also read that the range of vehicle prices was 76.265 thousand USD (76,265 USD), meaning the most expensive vehicle was 76,265 USD more expensive than the cheapest. From the observed data we can state that the largest relative variation was in the number of sales of each vehicle model, the coefficient of variation being 1.295 or 129.5%.

# Mean price (column 4) and mean horsepower (column 5) in one call.
# vapply() is used instead of sapply() so the result is guaranteed to be one
# numeric value per column; the printed named vector is the same.
vapply(owndata2[, c(4, 5)], FUN = mean, FUN.VALUE = numeric(1))
## Original Price in '000 in USD                    Horsepower 
##                      27.44387                     185.39869

With the help of the sapply function we can calculate, in one step, the average price of a new vehicle, which was 27,443.87 USD, and the average horsepower of a new vehicle, which was 185.40 horsepower.

# Median original price (in thousands of USD).
median(owndata2[["Original Price in '000 in USD"]])
## [1] 22.799

The median price of a new vehicle was 22,799 USD, meaning that 50% of the new vehicles bought cost 22,799 USD or less and the other 50% cost more than 22,799 USD.

Distribution of our sample data based on Original price in USD ’000.

# Histogram of original prices with bins 5 units (5,000 USD) wide.
ggplot(data = owndata2,
       mapping = aes(x = `Original Price in '000 in USD`)) +
  geom_histogram(fill = "white", colour = "black", binwidth = 5) +
  labs(y = "Frequency")

As can be seen from the histogram above, the distribution is not normal since, it is skewed/asymmetrical to the right.

Using scatterplot function to create a scatter plot showing the relationship between horsepower and fuel efficiency of a vehicle.

# Fuel efficiency (y) against horsepower (x); the loess smoother is switched
# off so only the points, the regression line and marginal boxplots remain.
scatterplot(x = owndata2$Horsepower,
            y = owndata2$`Fuel efficiency`,
            xlab = "Horsepower",
            ylab = "Fuel efficiency in MPG",
            main = "Fuel efficiency in relation to horsepower",
            smooth = FALSE)

Without any further testing of hypothesis we could, from the scatter plot graph above, predict that there is a negative relationship between fuel efficiency and horsepower of a vehicle. From the scatter plot and box plot combination we can observe 2 outliers in the horsepower variable and 1 outlier in the fuel efficiency variable that could potentially be removed to further improve the model if we were to pursue the testing of this hypothesis.

Scatter plot matrix of own selected data.

# Pairwise scatter plots of price, horsepower and fuel efficiency
# (the columns left after dropping 1, 2, 3, 7 and 8), without smoothers.
matrix_vars <- c("Original Price in '000 in USD", "Horsepower", "Fuel efficiency")
scatterplotMatrix(owndata2[, matrix_vars], smooth = FALSE)

With the above scatter plot matrix we can see that all of the distributions from the selected sample data set are skewed to the right, with outliers on the right hand sides of the distributions. The original price of a vehicle in ’000 USD is positively correlated to the horsepower a vehicle has, so a vehicle with more horsepower will cost more when bought new. As I have already shown, there is a negative correlation between horsepower and fuel efficiency, a vehicle with more horsepower will have a higher fuel consumption.

Using a boxplot graph to create a box plot showing the difference between prices of US manufacturer vehicles and Non-US manufacturer vehicles.

# Box plots of original price, one box per country-of-origin group.
ggplot(data = owndata2,
       mapping = aes(x = `CountryF`, y = `Original Price in '000 in USD`)) +
  geom_boxplot() +
  labs(x = "Country of origin", y = "Original Price in '000 USD")

The box plot above shows how the prices of US manufacturer vehicles compare to those of Non-US manufacturer vehicles. We can see from the graph above, that US made vehicles are in general cheaper than its Non-US counterparts.

TASK 2 - BODY MASS DATA SET

Importing the body mass data set for task 2.

# Load the body mass CSV: semicolon-separated fields, "," as decimal mark,
# first row holds the column names.
body_mass_path <- "~/Desktop/IMB/BOOTCAMP/R TAKE HOME EXAM/Task 2/Body mass.csv"
bodymassdata <- read.table(file = body_mass_path, sep = ";", dec = ",", header = TRUE)
head(bodymassdata, n = 10)
##    ID Mass
## 1   1 62.1
## 2   2 64.5
## 3   3 56.5
## 4   4 53.4
## 5   5 61.3
## 6   6 62.2
## 7   7 62.7
## 8   8 64.5
## 9   9 59.5
## 10 10 68.9

Descriptive statistics of body mass of 9th graders.

# Descriptive statistics for every column of the body mass data, 2 decimals.
# The ID column is included as well, although only Mass is interpreted.
round(stat.desc(bodymassdata), digits = 2)
##                   ID    Mass
## nbr.val        50.00   50.00
## nbr.null        0.00    0.00
## nbr.na          0.00    0.00
## min             1.00   49.70
## max            50.00   83.20
## range          49.00   33.50
## sum          1275.00 3143.80
## median         25.50   62.80
## mean           25.50   62.88
## SE.mean         2.06    0.85
## CI.mean.0.95    4.14    1.71
## var           212.50   36.14
## std.dev        14.58    6.01
## coef.var        0.57    0.10

From the descriptive statistics we can observe that there were 50 students tested, with a mean weight of 62.88 kg and a median of 62.80 kg. The range of the mass variable, meaning the difference between the lightest and heaviest student, is 33.5 kg. The coefficient of variation shows that the standard deviation amounts to about 10% of the mean.

Histogram of body mass of 9th graders in kg.

# Histogram of body mass with 5 kg wide bins spanning 20-100 kg.
mass_breaks <- seq(from = 20, to = 100, by = 5)
hist(bodymassdata$Mass,
     breaks = mass_breaks,
     xlab = "Weight in kg",
     ylab = "Frequency",
     main = "Body mass of 9th graders")

Setting the x-axis range from 20 to 100 kg allows us to capture all values, while still keeping the logic that no one can weigh 0 kg. We set the bin width to 5 kg so that the bins are contiguous throughout the histogram. We can observe that the distribution is approximately normal, with the highest frequency at around 65 kg.

Hypothesis testing H0: 𝜇= 59.5 kg

H1: 𝜇≠ 59.5 kg

# Two-sided one-sample t-test of H0: mu = 59.5 kg, with a 95% confidence
# interval for the mean.
t.test(bodymassdata$Mass,
       conf.level = 0.95,
       alternative = "two.sided",
       mu = 59.5)
## 
##  One Sample t-test
## 
## data:  bodymassdata$Mass
## t = 3.9711, df = 49, p-value = 0.000234
## alternative hypothesis: true mean is not equal to 59.5
## 95 percent confidence interval:
##  61.16758 64.58442
## sample estimates:
## mean of x 
##    62.876

With the two-sided t-test we reject the null hypothesis at the 5% significance level (p-value < 0.001): the average body mass of students in 9th grade in year 21/22 differs from the hypothesized mean of 59.5 kg, i.e. from that of the 9th-grade generation in 18/19.

Determining the effect size.

# Effect size r = sqrt(t^2 / (t^2 + df)), using the t statistic and degrees
# of freedom reported by the one-sample t-test output (t = 3.9711, df = 49).
t_value <- 3.9711
deg_freedom <- 49
sqrt(t_value^2 / (t_value^2 + deg_freedom))
## [1] 0.4934295

The calculated value of approximately 0.49 suggests a medium to large effect size for the difference in mass of 9th graders between the school years 18/19 and 21/22.

TASK 3 - APARTMENTS

Import the dataset Apartments.xlsx

# Read the apartments workbook into a tibble.
apartments_path <- "~/Desktop/IMB/BOOTCAMP/R TAKE HOME EXAM/Task 3/Apartments.xlsx"
apartmentdata <- readxl::read_xlsx(path = apartments_path)

Description:

  • Age: Age of an apartment in years

  • Distance: The distance from city center in km

  • Price: Price per m2

  • Parking: 0-No, 1-Yes

  • Balcony: 0-No, 1-Yes

Change categorical variables into factors

# Both indicator columns use the same coding: 0 = "No", 1 = "Yes".
# Convert each into a labelled factor stored in a new column.
apartmentdata[["ParkingFactor"]] <- factor(apartmentdata[["Parking"]],
                                           labels = c("No","Yes"),
                                           levels = c(0, 1))

apartmentdata[["BalconyFactor"]] <- factor(apartmentdata[["Balcony"]],
                                           labels = c("No","Yes"),
                                           levels = c(0, 1))

Test the hypothesis H0: Mu_Price = 1900 eur. What can you conclude?

# Two-sided one-sample t-test of H0: mean price per m2 = 1900 EUR,
# with a 95% confidence interval for the mean.
t.test(apartmentdata$Price,
       conf.level = 0.95,
       alternative = "two.sided",
       mu = 1900)
## 
##  One Sample t-test
## 
## data:  apartmentdata$Price
## t = 2.9022, df = 84, p-value = 0.004731
## alternative hypothesis: true mean is not equal to 1900
## 95 percent confidence interval:
##  1937.443 2100.440
## sample estimates:
## mean of x 
##  2018.941

With the two-sided t-test we reject the null hypothesis at the 5% significance level (p-value < 0.005): the average price per square meter of an apartment differs from the hypothesized mean of 1900 eur. We can state with 95% confidence that the mean price per square meter is between 1937.4 eur and 2100.4 eur.

Estimate the simple regression function: Price = f(Age). Save results in object fit1 and explain the estimate of regression coefficient, coefficient of correlation and coefficient of determination.

# Simple linear regression of price per m2 on apartment age.
fit1 <- lm(formula = Price ~ Age, data = apartmentdata)
summary(fit1)
## 
## Call:
## lm(formula = Price ~ Age, data = apartmentdata)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -623.9 -278.0  -69.8  243.5  776.1 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 2185.455     87.043  25.108   <2e-16 ***
## Age           -8.975      4.164  -2.156    0.034 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 369.9 on 83 degrees of freedom
## Multiple R-squared:  0.05302,    Adjusted R-squared:  0.04161 
## F-statistic: 4.647 on 1 and 83 DF,  p-value: 0.03401

The simple regression function is Price = f(Age) = 2185.46 - 8.975(Age). The estimate of the regression constant b0 tells us that when the age of the apartment is 0, the average price per square meter is 2185.46 eur. The estimate of the regression coefficient b1 states that with every additional year of age of an apartment (an increase in age by 1 unit), the average price per square meter decreases by 8.975 eur. The coefficient of determination (R2) is 0.05302, meaning that the age of apartments (independent variable) explains approximately 5.3% of the variation of price (dependent variable) in this data; this is quite a low R2.

# sqrt(R^2) only gives the magnitude of the correlation between Price and Age.
# Multiply by the sign of the Age slope so the printed value is the signed
# Pearson r, matching the rcorr() result.
sign(coef(fit1)[["Age"]]) * sqrt(summary(fit1)$r.squared)
## [1] -0.230255
# Correlation matrix with p-values for Age (column 1) and Price (column 3).
age_price <- as.matrix(apartmentdata[, c("Age", "Price")])
rcorr(age_price)
##         Age Price
## Age    1.00 -0.23
## Price -0.23  1.00
## 
## n= 85 
## 
## 
## P
##       Age   Price
## Age         0.034
## Price 0.034

The Pearson correlation coefficient R — whose magnitude is the square root of R2 from the summary of the regression model and whose sign matches that of the Age slope, as confirmed by the function rcorr — is -0.23, which depicts a weak negative linear relationship between the price of the apartment and the age of the apartment.

Show the scatterplot matrix between Price, Age and Distance. Based on the matrix determine if there is a potential problem with multicollinearity.

# Pairwise scatter plots in the order Price, Age, Distance (columns 3, 1, 2),
# without smoothers.
scatterplotMatrix(apartmentdata[, c("Price", "Age", "Distance")],
                  smooth = FALSE)

The above scatterplot matrix by itself does not show signs of multicollinearity between the explanatory variables Age and Distance. To make sure, we would have to calculate the variance inflation factors using the vif function.

Estimate the multiple regression function: Price = f(Age, Distance). Save it in object named fit2.

# Multiple linear regression of price per m2 on age and distance,
# estimated on the full apartment data set.
fit2 <- lm(formula = Price ~ Age + Distance, data = apartmentdata)
summary(fit2)
## 
## Call:
## lm(formula = Price ~ Age + Distance, data = apartmentdata)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -603.23 -219.94  -85.68  211.31  689.58 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 2460.101     76.632   32.10  < 2e-16 ***
## Age           -7.934      3.225   -2.46    0.016 *  
## Distance     -20.667      2.748   -7.52 6.18e-11 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 286.3 on 82 degrees of freedom
## Multiple R-squared:  0.4396, Adjusted R-squared:  0.4259 
## F-statistic: 32.16 on 2 and 82 DF,  p-value: 4.896e-11

Check the multicolinearity with VIF statistics. Explain the findings.

# Variance inflation factors for the predictors of fit2.
car::vif(mod = fit2)
##      Age Distance 
## 1.001845 1.001845

With the VIF function we can check that there is not a strong relationship between the two variables, that would suggest multicolinearity. Since, they are both extremely close to 1 we can conclude that we do not have a problem with multicolinearity.

Calculate standardized residuals and Cooks Distances for model fit2. Remove any potentially problematic case (outlier or unit with big influence).

# Store the standardized residuals and Cook's distances of fit2 as new
# columns, each rounded to 4 decimals.
apartmentdata[["StdResid"]] <- round(rstandard(fit2), digits = 4)
apartmentdata[["CooksD"]] <- round(cooks.distance(fit2), digits = 4)

We first calculated the values of standardized residuals and the values of cooks distance for the apartment data set and added them as new variables to the original data set.

# Histogram of the standardized residuals of fit2.
hist(apartmentdata$StdResid,
     main = "Histogram of standardized residual values",
     ylab = "Frequency",
     xlab = "Standardized residual values")

# Histogram of the Cook's distances of fit2.
hist(apartmentdata$CooksD,
     main = "Histogram of Cooks distances",
     ylab = "Frequency",
     xlab = "Cooks distance values")

Then we graphed them into histograms to check for outliers in standardized residual values and a big difference in Cooks distance values. When we graphed both of these values we can see that there is one outlier in standardized residuals (on histogram of standardized residuals below value -2) and one value with a large influence in the Cooks distances (on histogram of Cooks distances value between 0.30 and 0.35).

# Ten apartments with the smallest (most negative) standardized residuals.
by_std_resid <- order(apartmentdata$StdResid)
head(apartmentdata[by_std_resid, ], n = 10)
## # A tibble: 10 × 9
##      Age Distance Price Parking Balcony ParkingFactor BalconyFactor StdResid CooksD
##    <dbl>    <dbl> <dbl>   <dbl>   <dbl> <fct>         <fct>            <dbl>  <dbl>
##  1     7        2  1760       0       1 No            Yes              -2.15 0.0663
##  2    12       14  1650       0       1 No            Yes              -1.50 0.0131
##  3    12       14  1650       0       0 No            No               -1.50 0.0131
##  4    13        8  1800       0       0 No            No               -1.38 0.0123
##  5    14       16  1660       0       1 No            Yes              -1.26 0.0079
##  6    24        5  1830       1       0 Yes           No               -1.19 0.0115
##  7    30       17  1560       0       0 No            No               -1.10 0.012 
##  8    18       18  1640       1       1 Yes           Yes              -1.07 0.0051
##  9    18       18  1640       1       1 Yes           Yes              -1.07 0.0051
## 10    18       19  1620       1       0 Yes           No               -1.07 0.0054
# Ten apartments with the largest Cook's distances (sorted descending by
# ordering on the negated values).
by_cooks_desc <- order(-apartmentdata$CooksD)
head(apartmentdata[by_cooks_desc, ], n = 10)
## # A tibble: 10 × 9
##      Age Distance Price Parking Balcony ParkingFactor BalconyFactor StdResid CooksD
##    <dbl>    <dbl> <dbl>   <dbl>   <dbl> <fct>         <fct>            <dbl>  <dbl>
##  1     5       45  2180       1       1 Yes           Yes               2.58 0.320 
##  2    43       37  1740       0       0 No            No                1.44 0.104 
##  3     2       11  2790       1       0 Yes           No                2.05 0.0691
##  4     7        2  1760       0       1 No            Yes              -2.15 0.0663
##  5    37        3  2540       1       1 Yes           Yes               1.58 0.0609
##  6    40        2  2400       0       1 No            Yes               1.09 0.0375
##  7     8        2  2820       1       0 Yes           No                1.66 0.0365
##  8     8       26  2300       1       1 Yes           Yes               1.57 0.0341
##  9    10        1  2810       0       0 No            No                1.60 0.032 
## 10    18        1  2800       1       0 Yes           No                1.78 0.0304
# Drop the two flagged cases by row number: row 38 (large Cook's distance)
# and row 53 (very low standardized residual).
flagged_rows <- c(38, 53)
apartmentdata1 <- apartmentdata[-flagged_rows, ]

From the original data we now remove both units of observation that were identified as an outlier and as a unit with a large influence. We removed row number 53, since the standard residual value was a lot lower than the standard residual of other observations. We also remove row 38, due to the fact that its Cooks distance is approximately 3 times larger than the next following value.

Check for potential heteroskedasticity with a scatterplot between standardized residuals and standardized fitted values. Explain the findings.

# Plot standardized residuals against standardized fitted values, as the axis
# labels state. The previous call plotted the raw residuals and raw fitted
# values under "Standardized" labels; rstandard() and scale() put both axes
# on the standardized scale the task asks for.
std_residuals <- rstandard(fit2)
std_fitted <- as.numeric(scale(fit2$fitted.values))
scatterplot(y = std_residuals, x = std_fitted,
            ylab = "Standardized residuals",
            xlab = "Standardized fitted values",
            boxplots = FALSE,
            smooth = FALSE)

From the graph we see no clear pattern in the spread of the residuals across the fitted values, so the variance appears to be constant. The plot therefore gives no indication of heteroskedasticity, and the assumption of homoskedasticity seems reasonable. To be sure we can also run the Breusch–Pagan heteroskedasticity test.

Are standardized residuals distributed normally? Show the graph and formally test it. Explain the findings.

# Histogram of the stored standardized residuals after the two flagged rows
# were removed.
hist(apartmentdata1$StdResid,
     main = "Histogram of standarized residuals",
     ylab = "Frequency",
     xlab = "Standarized Residuals")

From the above plotted histogram we can conclude, that while the distribution of standardized residuals is nearing a normal distribution it is not normally distributed for this sample size, even with the exclusion of outliers and large influence values. We can see it is skewed to the right.

For the Shapiro-Wilk normality test we have the hypothesis H0: Variable is normally distributed. H1: Variable is not normally distributed.

# Shapiro-Wilk test of H0: the standardized residuals are normally distributed.
shapiro.test(x = apartmentdata1$StdResid)
## 
##  Shapiro-Wilk normality test
## 
## data:  apartmentdata1$StdResid
## W = 0.93372, p-value = 0.0003461

From the result we can reject the null hypothesis at p < 0.0005 and thus confirm our assumption from above that the variable is not normally distributed.

Estimate the fit2 again without potentially excluded cases and show the summary of the model. Explain all coefficients.

# Re-estimate Price = f(Age, Distance) and show the model summary.
# NOTE(review): this call still passes data = apartmentdata (all rows), not
# apartmentdata1, so the two flagged cases are NOT excluded and the estimates
# equal those of the first fit2. Switching to apartmentdata1 would also
# require fitting fit3 on the same rows before anova(fit2, fit3) -- confirm
# the intended data set.
fit2 <- lm(formula = Price ~ Age + Distance, data = apartmentdata)
summary(fit2)
## 
## Call:
## lm(formula = Price ~ Age + Distance, data = apartmentdata)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -603.23 -219.94  -85.68  211.31  689.58 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 2460.101     76.632   32.10  < 2e-16 ***
## Age           -7.934      3.225   -2.46    0.016 *  
## Distance     -20.667      2.748   -7.52 6.18e-11 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 286.3 on 82 degrees of freedom
## Multiple R-squared:  0.4396, Adjusted R-squared:  0.4259 
## F-statistic: 32.16 on 2 and 82 DF,  p-value: 4.896e-11

From this fit we can then explain all of the coefficients. The intercept (2460.101) is the expected price per square meter of an apartment with an age of 0 years and a distance of 0 kilometers. Since the p-values of both age and distance are lower than the significance level of 0.05, we can reject the null hypothesis for both estimates and state that they do have a significant effect on the price per square meter of an apartment. From the estimate values we can see that if the age of the apartment is increased by 1 year, then the price on average decreases by 7.934 euros per square meter, holding all other variables constant. If distance is increased by 1 kilometer, then the price on average decreases by 20.667 euros per square meter, holding all other variables constant. The coefficient of determination (multiple R2) tells us that 43.96% of the variability of the dependent variable (price) can be explained by the included independent variables (age and distance); adjusted for the number of explanatory variables, this share is 42.6% (adjusted R2).

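*Note: because the same formula is estimated on the same data object (apartmentdata), R returns exactly the same estimates as in the original task; the results would only differ if the model were fitted to the data set without the excluded cases.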

Estimate the linear regression function Price = f(Age, Distance, Parking and Balcony). Be careful to correctly include categorical variables. Save the object named fit3.

# Extend the Age + Distance model with the two categorical predictors.
# ParkingFactor and BalconyFactor are factor columns, so lm() dummy-codes them
# against their first level.
fit3 <- lm(
  formula = Price ~ Age + Distance + ParkingFactor + BalconyFactor,
  data = apartmentdata
)

With function anova check if model fit3 fits data better than model fit2.

# F-test comparing the nested models: does adding ParkingFactor and
# BalconyFactor (fit3) reduce the residual sum of squares relative to fit2?
anova(fit2, fit3)
## Analysis of Variance Table
## 
## Model 1: Price ~ Age + Distance
## Model 2: Price ~ Age + Distance + ParkingFactor + BalconyFactor
##   Res.Df     RSS Df Sum of Sq      F  Pr(>F)  
## 1     82 6720983                              
## 2     80 5991088  2    729894 4.8732 0.01007 *
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Our hypotheses are as follows: H0: Fit2 (model 1, the simpler model) is more appropriate. H1: Fit3 (model 2, the model with the additional variables) is more appropriate.

Based on the p-value, which is approximately 0.01 (1%), we can reject the null hypothesis at the 5% significance level and conclude that fit3 is more appropriate and fits the data better than fit2.

Show the results of fit3 and explain regression coefficient for both categorical variables. Can you write down the hypothesis which is being tested with F-statistics, shown at the bottom of the output?

# Print the coefficient table, R-squared values and overall F-test for fit3.
summary(fit3)
## 
## Call:
## lm(formula = Price ~ Age + Distance + ParkingFactor + BalconyFactor, 
##     data = apartmentdata)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -459.92 -200.66  -57.48  260.08  594.37 
## 
## Coefficients:
##                  Estimate Std. Error t value Pr(>|t|)    
## (Intercept)      2301.667     94.271  24.415  < 2e-16 ***
## Age                -6.799      3.110  -2.186  0.03172 *  
## Distance          -18.045      2.758  -6.543 5.28e-09 ***
## ParkingFactorYes  196.168     62.868   3.120  0.00251 ** 
## BalconyFactorYes    1.935     60.014   0.032  0.97436    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 273.7 on 80 degrees of freedom
## Multiple R-squared:  0.5004, Adjusted R-squared:  0.4754 
## F-statistic: 20.03 on 4 and 80 DF,  p-value: 1.849e-11

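Regression coefficients of the categorical variables: given age, distance and balcony, apartments with parking are on average 196.168 euros per square meter more expensive than apartments without parking (p = 0.00251). Given age, distance and parking, apartments with a balcony are on average 1.935 euros per square meter more expensive than apartments without a balcony, but this difference is not statistically significant (p = 0.97436). The hypothesis being tested with the F-statistic at the bottom of the output: H0: ρ² = 0 (the explanatory variables explain none of the variability of price; all regression coefficients except the intercept are zero). H1: ρ² > 0 (at least one regression coefficient differs from zero).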

Save fitted values and calculate the residual for apartment ID2.

# Attach the fit3 predictions and residuals to the data, then show the first
# ten rows so the values for apartment ID 2 can be read off.
# NOTE(review): resid() returns raw (unstandardized) residuals, so the column
# name StdResid is a misnomer here and any earlier values in it are overwritten.
apartmentdata$Fittedvalues <- fitted(fit3)
apartmentdata$StdResid <- resid(fit3)

head(apartmentdata, n = 10)
## # A tibble: 10 × 10
##      Age Distance Price Parking Balcony ParkingFactor BalconyFactor StdResid CooksD Fittedvalues
##    <dbl>    <dbl> <dbl>   <dbl>   <dbl> <fct>         <fct>            <dbl>  <dbl>        <dbl>
##  1     7       28  1640       0       1 No            Yes            -111.   0.0074        1751.
##  2    18        1  2800       1       0 Yes           No              443.   0.0304        2357.
##  3     7       28  1660       0       0 No            No              -88.8  0.0059        1749.
##  4    28       29  1850       0       1 No            Yes             260.   0.0083        1590.
##  5    18       18  1640       1       1 Yes           Yes            -413.   0.0051        2053.
##  6    28       12  1770       0       1 No            Yes            -127.   0.0049        1897.
##  7    14       20  1850       0       1 No            Yes               2.49 0.0005        1848.
##  8    18        6  1970       1       1 Yes           Yes            -299.   0.0038        2269.
##  9    22        7  2270       1       0 Yes           No               48.1  0.0013        2222.
## 10    25        2  2570       1       0 Yes           No              278.   0.0167        2292.

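From the head function of the data set apartmentdata, we can see that the residual for apartment number 2 is 442.58 (actual price of 2800 minus the fitted value of 2357.42). Note that this is the raw, unstandardized residual, even though it is stored in the column named StdResid.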