mydata = readRDS("povp_denar.rds") #Reading the data
head(mydata, 10) #Showing first 10 units
## n bdp m1 om dpol
## 1 1960 1972.998 555.3420 4.41 0
## 2 1961 2025.810 568.3810 4.35 1
## 3 1962 2129.657 565.9464 4.33 1
## 4 1963 2218.187 579.6249 4.26 1
## 5 1964 2342.910 596.1677 4.40 1
## 6 1965 2473.337 607.8845 4.49 1
## 7 1966 2622.746 602.7257 5.13 1
## 8 1967 2690.208 622.3984 5.51 1
## 9 1968 2801.024 638.7402 6.18 1
## 10 1969 2876.987 627.8860 7.03 0
Description:
mydata <- mydata[, -1] #Excluding the variable "n"
mydata$dpolF <- factor(mydata$dpol,
levels = c(0, 1),
labels = c("Republicans", "Democrats")) #Creating a factor for categorical variable
head(mydata)
## bdp m1 om dpol dpolF
## 1 1972.998 555.3420 4.41 0 Republicans
## 2 2025.810 568.3810 4.35 1 Democrats
## 3 2129.657 565.9464 4.33 1 Democrats
## 4 2218.187 579.6249 4.26 1 Democrats
## 5 2342.910 596.1677 4.40 1 Democrats
## 6 2473.337 607.8845 4.49 1 Democrats
names(mydata) <- c("GDP", "M1", "INTEREST_RATE", "PPARTY") #Renaming all four variables
library(pastecs)
round(stat.desc(mydata[-4]), 2) #Presenting the descriptive statistics
## GDP M1 INTEREST_RATE NA
## nbr.val 34.00 34.00 34.00 NA
## nbr.null 0.00 0.00 0.00 NA
## nbr.na 0.00 0.00 0.00 NA
## min 1973.00 555.34 4.26 NA
## max 5126.06 928.46 14.17 NA
## range 3153.07 373.12 9.91 NA
## sum 119341.19 22189.58 279.26 NA
## median 3457.05 621.25 8.29 NA
## mean 3510.04 652.63 8.21 NA
## SE.mean 161.26 15.43 0.47 NA
## CI.mean.0.95 328.08 31.39 0.96 NA
## var 884118.98 8091.01 7.55 NA
## std.dev 940.28 89.95 2.75 NA
## coef.var 0.27 0.14 0.33 NA
Explanation of the descriptive statistics:
library(car)
## Loading required package: carData
scatterplot(mydata$M1 ~ mydata$GDP,
smooth = FALSE,
boxplots = FALSE,
ylab = "Real money in circulation (USD billion)",
xlab = "Real gross domestic product (USD billion)")
Explanation: A higher amount of money in circulation contributes to a higher GDP (on average). POSITIVE RELATIONSHIP
hist(mydata$GDP,
main = "Distribution of the variable GDP",
ylab = "Frequency",
xlab = "Real gross domestic product (USD billion)",
breaks = seq(0, 6000, 500),
right = FALSE)
Explanation: we can clearly see the normal distribution.
boxplot(mydata$INTEREST_RATE)
Explanation: 3rd Qu:75% of all the interest rates were higher than 8.29%.
mydata1 <- read.table("./Body mass.csv", header=TRUE, sep=";", dec=",")
head(mydata1)
## ID Mass
## 1 1 62.1
## 2 2 64.5
## 3 3 56.5
## 4 4 53.4
## 5 5 61.3
## 6 6 62.2
library(pastecs)
round(stat.desc(mydata1[ , -1]), 2)
## nbr.val nbr.null nbr.na min max range
## 50.00 0.00 0.00 49.70 83.20 33.50
## sum median mean SE.mean CI.mean.0.95 var
## 3143.80 62.80 62.88 0.85 1.71 36.14
## std.dev coef.var
## 6.01 0.10
hist(mydata1$Mass,
main = "Distribution of the variable Mass",
ylab = "Frequency",
xlab = "Mass",
breaks = seq(40, 90, 10),
right = FALSE)
t.test(mydata1$Mass,
mu = 59.5,
alternative = "two.sided")
##
## One Sample t-test
##
## data: mydata1$Mass
## t = 3.9711, df = 49, p-value = 0.000234
## alternative hypothesis: true mean is not equal to 59.5
## 95 percent confidence interval:
## 61.16758 64.58442
## sample estimates:
## mean of x
## 62.876
We reject H0 at p<0.001. We found out that the average body weight in ninth graders is not 59.5 kg. The true average body weight in ninth graders is 62.88 kg.
library(readxl)
mydata2 <- read_xlsx("./Apartments.xlsx")
mydata2 <- as.data.frame(mydata2)
head(mydata2)
## Age Distance Price Parking Balcony
## 1 7 28 1640 0 1
## 2 18 1 2800 1 0
## 3 7 28 1660 0 0
## 4 28 29 1850 0 1
## 5 18 18 1640 1 1
## 6 28 12 1770 0 1
Description:
mydata2$ParkingF <- factor(mydata2$Parking,
levels = c(0, 1),
labels = c("No", "Yes"))
mydata2$BalconyF <- factor(mydata2$Balcony,
levels = c(0, 1),
labels = c("No", "Yes"))
head(mydata2)
## Age Distance Price Parking Balcony ParkingF BalconyF
## 1 7 28 1640 0 1 No Yes
## 2 18 1 2800 1 0 Yes No
## 3 7 28 1660 0 0 No No
## 4 28 29 1850 0 1 No Yes
## 5 18 18 1640 1 1 Yes Yes
## 6 28 12 1770 0 1 No Yes
t.test(mydata2$Price,
mu = 1900,
alternative = "two.sided")
##
## One Sample t-test
##
## data: mydata2$Price
## t = 2.9022, df = 84, p-value = 0.004731
## alternative hypothesis: true mean is not equal to 1900
## 95 percent confidence interval:
## 1937.443 2100.440
## sample estimates:
## mean of x
## 2018.941
We reject H0 at p<0.005. We found out that the average price per m2 in the Ljubljana region is not 1900 eur.
fit1 <- lm(Price ~ Age,
data = mydata2)
summary(fit1)
##
## Call:
## lm(formula = Price ~ Age, data = mydata2)
##
## Residuals:
## Min 1Q Median 3Q Max
## -623.9 -278.0 -69.8 243.5 776.1
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2185.455 87.043 25.108 <2e-16 ***
## Age -8.975 4.164 -2.156 0.034 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 369.9 on 83 degrees of freedom
## Multiple R-squared: 0.05302, Adjusted R-squared: 0.04161
## F-statistic: 4.647 on 1 and 83 DF, p-value: 0.03401
cor(mydata2$Price, mydata2$Age)
## [1] -0.230255
regression coefficient b1 = -8.975 IfaAge of an apartment increases by 1 year, the the price per m2 decreases for 8.975 eur.
coefficient of correlation = -0.23 There is negative weak linear relationship between price and age.
coefficient of determination R - squared = 0.053 5.3% of variability of price can be explained by the effect of the age of the apartment.
library(car)
scatterplotMatrix(mydata2[ , c(-4,-5,-6,-7)],
smooth = FALSE)
On the matrix we can see that there is no potential problem with multicolinearity.
fit2 <- lm(Price ~ Age + Distance,
data = mydata2)
summary(fit2)
##
## Call:
## lm(formula = Price ~ Age + Distance, data = mydata2)
##
## Residuals:
## Min 1Q Median 3Q Max
## -603.23 -219.94 -85.68 211.31 689.58
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2460.101 76.632 32.10 < 2e-16 ***
## Age -7.934 3.225 -2.46 0.016 *
## Distance -20.667 2.748 -7.52 6.18e-11 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 286.3 on 82 degrees of freedom
## Multiple R-squared: 0.4396, Adjusted R-squared: 0.4259
## F-statistic: 32.16 on 2 and 82 DF, p-value: 4.896e-11
vif(fit2)
## Age Distance
## 1.001845 1.001845
Explanation: both values are below 5. There’s no problem with multicolinearity between Age and Distance.
mydata2$StdResid <- round(rstandard(fit2), 3)
mydata2$CooksD <- round(cooks.distance(fit2), 3)
hist(mydata2$StdResid,
xlab = "Standardized residuals",
ylab = "Frequency",
main = "Histogram of standardized residuals")
head(mydata2[order(mydata2$StdResid),], 3)
## Age Distance Price Parking Balcony ParkingF BalconyF StdResid CooksD
## 53 7 2 1760 0 1 No Yes -2.152 0.066
## 13 12 14 1650 0 1 No Yes -1.499 0.013
## 72 12 14 1650 0 0 No No -1.499 0.013
head(mydata2[order(-mydata2$CooksD),], 6)
## Age Distance Price Parking Balcony ParkingF BalconyF StdResid CooksD
## 38 5 45 2180 1 1 Yes Yes 2.577 0.320
## 55 43 37 1740 0 0 No No 1.445 0.104
## 33 2 11 2790 1 0 Yes No 2.051 0.069
## 53 7 2 1760 0 1 No Yes -2.152 0.066
## 22 37 3 2540 1 1 Yes Yes 1.576 0.061
## 39 40 2 2400 0 1 No Yes 1.091 0.038
mydata2 <- mydata2[-38, ]
/////
shapiro.test(mydata2$StdResid)
##
## Shapiro-Wilk normality test
##
## data: mydata2$StdResid
## W = 0.94879, p-value = 0.002187
p < 0.05 we can reject H0. We can say that stand.residuals are not normally distributed.The graph is in row 220.
fit2 <- lm(Price ~ Age + Distance,
data = mydata2)
summary(fit2)
##
## Call:
## lm(formula = Price ~ Age + Distance, data = mydata2)
##
## Residuals:
## Min 1Q Median 3Q Max
## -604.92 -229.63 -56.49 192.97 599.35
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2456.076 73.931 33.221 < 2e-16 ***
## Age -6.464 3.159 -2.046 0.044 *
## Distance -22.955 2.786 -8.240 2.52e-12 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 276.1 on 81 degrees of freedom
## Multiple R-squared: 0.4838, Adjusted R-squared: 0.4711
## F-statistic: 37.96 on 2 and 81 DF, p-value: 2.339e-12
If age of an apartment increases by 1 year, the the price per m2 decreases for 7.934 eur.
If distance from city center increases by 1 kmr, the the price per m2 decreases for 20.667 eur.
fit3 <- lm(Price ~ Age + Distance + ParkingF + BalconyF,
data = mydata2)
summary(fit3)
##
## Call:
## lm(formula = Price ~ Age + Distance + ParkingF + BalconyF, data = mydata2)
##
## Residuals:
## Min 1Q Median 3Q Max
## -473.21 -192.37 -28.89 204.17 558.77
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2329.724 93.066 25.033 < 2e-16 ***
## Age -5.821 3.074 -1.894 0.06190 .
## Distance -20.279 2.886 -7.026 6.66e-10 ***
## ParkingFYes 167.531 62.864 2.665 0.00933 **
## BalconyFYes -15.207 59.201 -0.257 0.79795
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 267.5 on 79 degrees of freedom
## Multiple R-squared: 0.5275, Adjusted R-squared: 0.5035
## F-statistic: 22.04 on 4 and 79 DF, p-value: 3.018e-12
anova(fit3, fit2)
## Analysis of Variance Table
##
## Model 1: Price ~ Age + Distance + ParkingF + BalconyF
## Model 2: Price ~ Age + Distance
## Res.Df RSS Df Sum of Sq F Pr(>F)
## 1 79 5654480
## 2 81 6176767 -2 -522287 3.6485 0.03051 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
We reject H0 (p<0.05). Model 2 is better. It explains more variability in pricing.
fit3 <- lm(Price ~ Age + Distance + ParkingF + BalconyF,
data = mydata2)
summary(fit3)
##
## Call:
## lm(formula = Price ~ Age + Distance + ParkingF + BalconyF, data = mydata2)
##
## Residuals:
## Min 1Q Median 3Q Max
## -473.21 -192.37 -28.89 204.17 558.77
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2329.724 93.066 25.033 < 2e-16 ***
## Age -5.821 3.074 -1.894 0.06190 .
## Distance -20.279 2.886 -7.026 6.66e-10 ***
## ParkingFYes 167.531 62.864 2.665 0.00933 **
## BalconyFYes -15.207 59.201 -0.257 0.79795
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 267.5 on 79 degrees of freedom
## Multiple R-squared: 0.5275, Adjusted R-squared: 0.5035
## F-statistic: 22.04 on 4 and 79 DF, p-value: 3.018e-12
Assuming that all other variables are the same, the price per m2 of the apartment with parking in Ljubljana region is higher for 196.168 eur compared to the apartment without parking (p<0.01).(??)
We can not say that balcony has statistical effect on price per m2 in Ljubljana region (p>0.05).
///