covid <- read.table("./covid_19.csv",
header=TRUE,
sep=",",
dec=",")
head(covid)
## country continent population day
## 1 Saint-Helena Africa 6115.0 2024-06-30
## 2 Falkland-Islands South-America 3539.0 2024-06-30
## 3 Montserrat North-America 4965.0 2024-06-30
## 4 Diamond-Princess 2024-06-30
## 5 Vatican-City Europe 799.0 2024-06-30
## 6 Western-Sahara Africa 626161.0 2024-06-30
## time Cases Recovered Deaths Tests
## 1 2024-06-30T16:15:16+00:00 2166 2.0
## 2 2024-06-30T16:15:16+00:00 1930 1930.0 8632.0
## 3 2024-06-30T16:15:16+00:00 1403 1376.0 8.0 17762.0
## 4 2024-06-30T16:15:16+00:00 712 699.0 13.0
## 5 2024-06-30T16:15:16+00:00 29 29.0
## 6 2024-06-30T16:15:16+00:00 10 9.0 1.0
The dataset consists of data about Covid 19: 1. Country name 2. Continent name 3. Population of the country 4. Last update day 5. Last update time 6. Total cases of covid 7. Number of recovered patients 8. Number of deaths 9. Number of conducted tests
library(psych)
library(naniar)
library(tidyr)
library(ggplot2)
##
## Attaching package: 'ggplot2'
## The following objects are masked from 'package:psych':
##
## %+%, alpha
library(car)
## Loading required package: carData
##
## Attaching package: 'car'
## The following object is masked from 'package:psych':
##
## logit
covid <- replace_with_na_all(covid, condition = ~ .x == "")
covid1<- drop_na(covid)
covid1
## # A tibble: 169 × 9
## country continent population day time Cases Recovered Deaths Tests
## <chr> <chr> <chr> <chr> <chr> <int> <chr> <chr> <chr>
## 1 Montserrat North-Am… 4965.0 2024… 2024… 1403 1376.0 8.0 1776…
## 2 China Asia 144847140… 2024… 2024… 503302 379053.0 5272.0 1600…
## 3 Saint-Pierre-… North-Am… 5759.0 2024… 2024… 3452 2449.0 2.0 2540…
## 4 Djibouti Africa 1016097.0 2024… 2024… 15690 15427.0 189.0 3059…
## 5 Greenland North-Am… 56973.0 2024… 2024… 11971 2761.0 21.0 1649…
## 6 Eritrea Africa 3662244.0 2024… 2024… 10189 10086.0 103.0 2369…
## 7 Niger Africa 26083660.0 2024… 2024… 9931 8890.0 312.0 2545…
## 8 Equatorial-Gu… Africa 1496662.0 2024… 2024… 17229 16907.0 183.0 3656…
## 9 Liberia Africa 5305117.0 2024… 2024… 8090 7783.0 295.0 1398…
## 10 Nauru Oceania 10903.0 2024… 2024… 5393 5347.0 1.0 2050…
## # ℹ 159 more rows
I got rid of all the empty brackets.
covid1$Deaths <- as.numeric(as.character(covid1$Deaths))
covid1$population <- as.numeric(as.character(covid1$population))
covid1$Cases <- as.numeric(as.character(covid1$Cases))
covid1$DeathsPerCapita <- covid1$Deaths / covid1$population * 1000
covid1$CasesPerCapita <- covid1$Cases / covid1$population*1000
head(covid1)
## # A tibble: 6 × 11
## country continent population day time Cases Recovered Deaths Tests
## <chr> <chr> <dbl> <chr> <chr> <dbl> <chr> <dbl> <chr>
## 1 Montserrat North-Am… 4965 2024… 2024… 1403 1376.0 8 1776…
## 2 China Asia 1448471400 2024… 2024… 503302 379053.0 5272 1600…
## 3 Saint-Pierre-M… North-Am… 5759 2024… 2024… 3452 2449.0 2 2540…
## 4 Djibouti Africa 1016097 2024… 2024… 15690 15427.0 189 3059…
## 5 Greenland North-Am… 56973 2024… 2024… 11971 2761.0 21 1649…
## 6 Eritrea Africa 3662244 2024… 2024… 10189 10086.0 103 2369…
## # ℹ 2 more variables: DeathsPerCapita <dbl>, CasesPerCapita <dbl>
I added a column deaths and cases per capita to have a better comparison among countries.
covid2<-covid1[ ,c(1,10)]
print(covid2[order(-covid2$DeathsPerCapita), ])
## # A tibble: 169 × 2
## country DeathsPerCapita
## <chr> <dbl>
## 1 Peru 6.60
## 2 Bulgaria 5.66
## 3 Hungary 5.11
## 4 Bosnia-and-Herzegovina 5.04
## 5 North-Macedonia 4.79
## 6 Croatia 4.60
## 7 Montenegro 4.53
## 8 Czechia 4.05
## 9 Slovakia 3.89
## 10 San-Marino 3.76
## # ℹ 159 more rows
I created a new dataframe with only countries and deaths per capita so its more visible and practical.
describeBy(covid1[, c("DeathsPerCapita","CasesPerCapita","Recovered")])
## Warning in describeBy(covid1[, c("DeathsPerCapita", "CasesPerCapita",
## "Recovered")]): no grouping variable requested
## vars n mean sd median trimmed mad min max range
## DeathsPerCapita 1 169 1.28 1.37 0.80 1.07 1.07 0.00 6.60 6.59
## CasesPerCapita 2 169 184.74 197.28 116.17 157.64 165.33 0.35 771.65 771.31
## Recovered* 3 169 85.00 48.93 85.00 85.00 62.27 1.00 169.00 168.00
## skew kurtosis se
## DeathsPerCapita 1.28 1.27 0.11
## CasesPerCapita 0.99 -0.05 15.18
## Recovered* 0.00 -1.22 3.76
Fifty percent of countries had up to almost 120 cases per 1000 inhabitants. On average each country had 1,28 deaths per 1000 inhabitants. Standard deviation from the mean of the recovered patients is 48.93, which means the individual samples on average are 48.93 different from the mean.
ggplot(covid1, aes(y = log1p(DeathsPerCapita))) +
geom_boxplot(fill = "lightblue", color = "black") +
ylab("Deaths Per Capita") +
ggtitle("Box Plot of Deaths Per Capita") +
theme_minimal()
I can see from boxplot that most of countries experienced around 0,3-1,2 deaths per 1000 inhabitants.
scatterplot(covid1$Deaths~covid1$Cases,
smooth=FALSE,
boxplot=FALSE,
regLine=TRUE,
ylab="Deaths",xlab="Cases")
I can see from scatterplot that there is a positive corellation between
deaths and cases. If country has more cases it will on average result in
more deaths.
library(readxl)
mydata<-read_xlsx("./Business School.xlsx")
mydata<-as.data.frame(mydata)
head(mydata)
## Student ID Undergrad Degree Undergrad Grade MBA Grade Work Experience
## 1 1 Business 68.4 90.2 No
## 2 2 Computer Science 70.2 68.7 Yes
## 3 3 Finance 76.4 83.3 No
## 4 4 Business 82.6 88.7 No
## 5 5 Finance 76.9 75.4 No
## 6 6 Computer Science 83.3 82.1 No
## Employability (Before) Employability (After) Status Annual Salary
## 1 252 276 Placed 111000
## 2 101 119 Placed 107000
## 3 401 462 Placed 109000
## 4 287 342 Placed 148000
## 5 275 347 Placed 255500
## 6 254 313 Placed 103500
library(ggplot2)
ggplot(mydata, aes(x = `Undergrad Degree`)) +
geom_bar(binwidth=500000,color = "dodgerblue", fill = "lightblue") +
ylab("FREQUENCY") +
xlab("Undergraduate Degree") +
ggtitle("Frequency of Undergraduate Degrees")
## Warning in geom_bar(binwidth = 5e+05, color = "dodgerblue", fill =
## "lightblue"): Ignoring unknown parameters: `binwidth`
Histogram on Undergraduate Degrees. Most common is degree in
Business.
describeBy(mydata$`Annual Salary`)
## Warning in describeBy(mydata$`Annual Salary`): no grouping variable requested
## vars n mean sd median trimmed mad min max range skew
## X1 1 100 109058 41501.49 103500 104600.2 25945.5 20000 340000 320000 2.22
## kurtosis se
## X1 9.41 4150.15
summary(mydata$`Annual Salary`)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 20000 87125 103500 109058 124000 340000
library(ggplot2)
ggplot(mydata, aes(x = `Annual Salary`)) +
geom_histogram(binwidth = 20000, color = "black", fill = "lightblue") +
ylab("Frequency") +
xlab("Annual Salary") +
ggtitle("Distribution of Annual Salary") +
xlim(10000, 350000) +
theme_minimal()
Annual salary distribution is pretty symetrical. Most of the people earn around 100000 dollars.
t.test(mydata$`Undergrad Grade`,
mu=74,
alternative="two.sided")
##
## One Sample t-test
##
## data: mydata$`Undergrad Grade`
## t = 3.8851, df = 99, p-value = 0.0001849
## alternative hypothesis: true mean is not equal to 74
## 95 percent confidence interval:
## 75.41841 78.37959
## sample estimates:
## mean of x
## 76.899
P-value is 0.0001849, which is lower than 0.05, so we can reject the hypotheses and there is enough evidence that mean is significantly different from 74.
library(effectsize)
##
## Attaching package: 'effectsize'
## The following object is masked from 'package:psych':
##
## phi
cohens_d(mydata$`Undergrad Grade`,
mu=74)
## Cohen's d | 95% CI
## ------------------------
## 0.39 | [0.18, 0.59]
##
## - Deviation from a difference of 74.
Indicator is 0.39, meaning that the difference between the actual mean grade and 74 is not very large but also not trivial.
data<-read_xlsx("./Apartments.xlsx")
data<-as.data.frame(data)
head(data)
## Age Distance Price Parking Balcony
## 1 7 28 1640 0 1
## 2 18 1 2800 1 0
## 3 7 28 1660 0 0
## 4 28 29 1850 0 1
## 5 18 18 1640 1 1
## 6 28 12 1770 0 1
Description:
data$BalconyF<-factor(data$Balcony,
levels=c(1,0),
labels=c("YES","NO"))
data$ParkingF<-factor(data$Parking,
levels=c(1,0),
labels=c("YES","NO"))
head(data)
## Age Distance Price Parking Balcony BalconyF ParkingF
## 1 7 28 1640 0 1 YES NO
## 2 18 1 2800 1 0 NO YES
## 3 7 28 1660 0 0 NO NO
## 4 28 29 1850 0 1 YES NO
## 5 18 18 1640 1 1 YES YES
## 6 28 12 1770 0 1 YES NO
t.test(data$Price,
mu=1900,
alternative="two.sided")
##
## One Sample t-test
##
## data: data$Price
## t = 2.9022, df = 84, p-value = 0.004731
## alternative hypothesis: true mean is not equal to 1900
## 95 percent confidence interval:
## 1937.443 2100.440
## sample estimates:
## mean of x
## 2018.941
p-value is very low so we can reject the hypotheses and say that there is enough evidence for price to be significantly different from 1900.
library(car)
scatterplot(y = data$Price,
x = data$Age,
ylab = "Price",
xlab = "Age",
smooth = FALSE)
fit1 <- lm(Price ~ Age, data = data)
summary(fit1)
##
## Call:
## lm(formula = Price ~ Age, data = data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -623.9 -278.0 -69.8 243.5 776.1
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2185.455 87.043 25.108 <2e-16 ***
## Age -8.975 4.164 -2.156 0.034 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 369.9 on 83 degrees of freedom
## Multiple R-squared: 0.05302, Adjusted R-squared: 0.04161
## F-statistic: 4.647 on 1 and 83 DF, p-value: 0.03401
cor(data$Price,data$Age)
## [1] -0.230255
p-value is less than 0.05 so we reject the hypoteses that there is no corellation between price and age. Coefficient of determination is 0.05302 which means age explains only around 5% of variability of price.Corellation coefficient is -0,23, which means there is weak linear relationship between price and age.
library(car)
scatterplotMatrix(data[ , c(-4,-5,-6, -7)],
smooth = FALSE)
data2 <- lm(Price ~ Age + Distance,
data = data)
vif(data2)
## Age Distance
## 1.001845 1.001845
There is no problem with multicolinearilty as VIF is less than 5, meaning explanatory variables are mostly independent. Also dispersion of points is not very correlated.
fit2 <- lm(Price ~ Age + Distance,
data = data)
summary(fit2)
##
## Call:
## lm(formula = Price ~ Age + Distance, data = data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -603.23 -219.94 -85.68 211.31 689.58
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2460.101 76.632 32.10 < 2e-16 ***
## Age -7.934 3.225 -2.46 0.016 *
## Distance -20.667 2.748 -7.52 6.18e-11 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 286.3 on 82 degrees of freedom
## Multiple R-squared: 0.4396, Adjusted R-squared: 0.4259
## F-statistic: 32.16 on 2 and 82 DF, p-value: 4.896e-11
vif(fit2)
## Age Distance
## 1.001845 1.001845
As mentioned before, there is no strong colleration among the explanatory variables.
data$StdResid <- round(rstandard(fit2), 3)
data$CooksD <- round(cooks.distance(fit2), 3)
hist(data$StdResid,
xlab = "Standardized residuals",
ylab = "Frequency",
main = "Histogram of standardized residuals")
tail(data[order(data$StdResid),], 5)
## Age Distance Price Parking Balcony BalconyF ParkingF StdResid CooksD
## 58 8 2 2820 1 0 NO YES 1.655 0.037
## 2 18 1 2800 1 0 NO YES 1.783 0.030
## 61 18 1 2800 1 1 YES YES 1.783 0.030
## 33 2 11 2790 1 0 NO YES 2.051 0.069
## 38 5 45 2180 1 1 YES YES 2.577 0.320
head(data[order(data$StdResid),], 5)
## Age Distance Price Parking Balcony BalconyF ParkingF StdResid CooksD
## 53 7 2 1760 0 1 YES NO -2.152 0.066
## 13 12 14 1650 0 1 YES NO -1.499 0.013
## 72 12 14 1650 0 0 NO NO -1.499 0.013
## 20 13 8 1800 0 0 NO NO -1.381 0.012
## 35 14 16 1660 0 1 YES NO -1.261 0.008
We do not exclude any unit as their residual is not lower than -3 or higher than 3(based on standardized residuals).
head(data[order(-data$CooksD),], 6)
## Age Distance Price Parking Balcony BalconyF ParkingF StdResid CooksD
## 38 5 45 2180 1 1 YES YES 2.577 0.320
## 55 43 37 1740 0 0 NO NO 1.445 0.104
## 33 2 11 2790 1 0 NO YES 2.051 0.069
## 53 7 2 1760 0 1 YES NO -2.152 0.066
## 22 37 3 2540 1 1 YES YES 1.576 0.061
## 39 40 2 2400 0 1 YES NO 1.091 0.038
We exclude unit number 38 as it has a high influence.
hist(data$CooksD,
xlab = "Cooks distance",
ylab = "Frequency",
main = "Histogram of Cooks distances")
data <- data[-38, ]
head(data[order(-data$CooksD),], 6) #unit 38 removed
## Age Distance Price Parking Balcony BalconyF ParkingF StdResid CooksD
## 55 43 37 1740 0 0 NO NO 1.445 0.104
## 33 2 11 2790 1 0 NO YES 2.051 0.069
## 53 7 2 1760 0 1 YES NO -2.152 0.066
## 22 37 3 2540 1 1 YES YES 1.576 0.061
## 39 40 2 2400 0 1 YES NO 1.091 0.038
## 58 8 2 2820 1 0 NO YES 1.655 0.037
I removed 38 because it had high influence on the model.
fit2$StdFitted <- scale(fit2$fitted.values)
library(car)
scatterplot(y = fit2$StdResid, x = fit2$StdFitted,
ylab = "Standardized residuals",
xlab = "Standardized fitted values",
boxplots = FALSE,
regLine = FALSE,
smooth = FALSE)
Dots are relatively constant, so there is no heteskedasticity.
fit2 <- lm(Price ~ Age + Distance,
data = data)
summary(fit2)
##
## Call:
## lm(formula = Price ~ Age + Distance, data = data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -604.92 -229.63 -56.49 192.97 599.35
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2456.076 73.931 33.221 < 2e-16 ***
## Age -6.464 3.159 -2.046 0.044 *
## Distance -22.955 2.786 -8.240 2.52e-12 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 276.1 on 81 degrees of freedom
## Multiple R-squared: 0.4838, Adjusted R-squared: 0.4711
## F-statistic: 37.96 on 2 and 81 DF, p-value: 2.339e-12
Both age and distance have a negative corellation with a price of apartment. One year older apartment is on average 6,4 units cheaper than one year younger. And apartment with one additional unit of distance is on average 22,9 units cheaper than the one closer. R squared is 0,48 which means 48% of variability of price is explained by age and distance. F-statistic shows that we can reject null hypotheses and this model is good and explains at least some of the variability.
sqrt(summary(fit2)$r.squared)
## [1] 0.6955609
Linear relationship between price, age and distance is 70%, medium to strong.
fit3 <- lm(Price ~ Age + Distance + ParkingF + BalconyF,
data = data)
summary(fit3)
##
## Call:
## lm(formula = Price ~ Age + Distance + ParkingF + BalconyF, data = data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -473.21 -192.37 -28.89 204.17 558.77
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2482.048 79.263 31.314 < 2e-16 ***
## Age -5.821 3.074 -1.894 0.06190 .
## Distance -20.279 2.886 -7.026 6.66e-10 ***
## ParkingFNO -167.531 62.864 -2.665 0.00933 **
## BalconyFNO 15.207 59.201 0.257 0.79795
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 267.5 on 79 degrees of freedom
## Multiple R-squared: 0.5275, Adjusted R-squared: 0.5035
## F-statistic: 22.04 on 4 and 79 DF, p-value: 3.018e-12
anova(fit2,fit3)
## Analysis of Variance Table
##
## Model 1: Price ~ Age + Distance
## Model 2: Price ~ Age + Distance + ParkingF + BalconyF
## Res.Df RSS Df Sum of Sq F Pr(>F)
## 1 81 6176767
## 2 79 5654480 2 522287 3.6485 0.03051 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Fit3 is better as p-value is 0,03051, lower than 0,05.
summary(fit3)
##
## Call:
## lm(formula = Price ~ Age + Distance + ParkingF + BalconyF, data = data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -473.21 -192.37 -28.89 204.17 558.77
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2482.048 79.263 31.314 < 2e-16 ***
## Age -5.821 3.074 -1.894 0.06190 .
## Distance -20.279 2.886 -7.026 6.66e-10 ***
## ParkingFNO -167.531 62.864 -2.665 0.00933 **
## BalconyFNO 15.207 59.201 0.257 0.79795
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 267.5 on 79 degrees of freedom
## Multiple R-squared: 0.5275, Adjusted R-squared: 0.5035
## F-statistic: 22.04 on 4 and 79 DF, p-value: 3.018e-12
An apartment with parking is on average 167.5 units more expensive. Apartment having balcony is statistically irelevant as p-value is greater than 0.05. F-statistic shows that we can reject null hypotheses and this model is good and explains at least some of the variability.
data$Fitted <- fitted.values(fit3)
data$Residuals <- residuals(fit3)
head(data[ ,colnames(data)%in% c("Price", "Fitted", "Residuals")])
## Price Fitted Residuals
## 1 1640 1705.952 -65.95206
## 2 2800 2372.197 427.80292
## 3 1660 1721.159 -61.15894
## 4 1850 1563.431 286.56890
## 5 1640 2012.244 -372.24396
## 6 1770 1908.177 -138.17733
Based on the model, we would expect ID2 to be 2372 units, but it is instead 2800 units per square meter. The residual difference is therefore 427.8 units per square meter.