# Load the seminar data set; the data are panel data.
# `header = TRUE` and `sep = ","` are the read.csv() defaults, so they are
# not repeated here.
date <- read.csv(file = "date_seminar_costuri_regiune.csv")
#install.packages("ggplot2")
library(ggplot2)
# Per-column summary statistics for the full data set.
summary(object = date)
## MSA Date Home_Value Structure_Cost
## Length:5796 Length:5796 Min. : 55094 Min. : 34894
## Class :character Class :character 1st Qu.: 123809 1st Qu.: 78809
## Mode :character Mode :character Median : 170911 Median :102434
## Mean : 223947 Mean :115065
## 3rd Qu.: 256604 3rd Qu.:145661
## Max. :1346489 Max. :289896
## Land_Value Land_Share Home_Price_Index Land_Price_Index
## Min. : 2755 Min. :0.0500 Min. :0.3076 Min. :0.0000
## 1st Qu.: 25260 1st Qu.:0.2107 1st Qu.:0.7522 1st Qu.:0.4819
## Median : 58408 Median :0.3544 Median :0.9971 Median :0.7812
## Mean : 108882 Mean :0.3797 Mean :1.0645 Mean :0.9060
## 3rd Qu.: 122911 3rd Qu.:0.5427 3rd Qu.:1.3000 3rd Qu.:1.1788
## Max. :1159348 Max. :0.8894 Max. :2.7172 Max. :4.8386
## region
## Length:5796
## Class :character
## Mode :character
##
##
##
# Number of rows and columns of the full data set.
dim(x = date)
## [1] 5796 9
# Land values, home values and the price indices all show a large gap between
# their minimum and maximum.
# Keep in mind, however, that the data span a long period of about 30 years.
# Keep only the San Francisco rows.
date_sf <- date[which(date$MSA == "SANFRANCISCO"), , drop = FALSE]
# 126 quarters between 1984 and 2016
# a) descriptive statistics
summary(object = date_sf)
## MSA Date Home_Value Structure_Cost
## Length:126 Length:126 Min. : 235636 Min. : 41935
## Class :character Class :character 1st Qu.: 406116 1st Qu.: 73330
## Mode :character Mode :character Median : 694572 Median :104522
## Mean : 692200 Mean :122310
## 3rd Qu.: 909208 3rd Qu.:177796
## Max. :1346489 Max. :257243
## Land_Value Land_Share Home_Price_Index Land_Price_Index
## Min. : 193701 Min. :0.7304 Min. :0.3472 Min. :0.2987
## 1st Qu.: 330138 1st Qu.:0.7973 1st Qu.:0.5985 1st Qu.:0.5476
## Median : 543178 Median :0.8211 Median :1.0236 Median :0.9750
## Mean : 569889 Mean :0.8204 Mean :1.0201 Mean :0.9993
## 3rd Qu.: 755749 3rd Qu.:0.8495 3rd Qu.:1.3399 3rd Qu.:1.3304
## Max. :1159348 Max. :0.8894 Max. :1.9843 Max. :2.0387
## region
## Length:126
## Class :character
## Mode :character
##
##
##
# There is a large gap between the minimum and maximum price indices, both for
# home values and for land.
# Land_Price_Index ranges from about 30% (0.2987) to about 204% (2.0387).
# Home_Value and Land_Value both have a wide range.
# b) Plot the land value for San Francisco.
# Plain line plot against the observation index.
plot(date_sf$Land_Value, col = "gold", type = "l", main = "Valoarea terenurilor in San Francisco")

# The same series drawn through ts.plot().
ts.plot(date_sf$Land_Value, col = "gold", main = "Valoarea terenurilor in San Francisco")

# Show the time period on the x axis: quarterly series starting in 1984 Q4.
plot.ts(ts(date_sf$Land_Value, frequency = 4, start = c(1984, 4)), col = "gold",
        main = "Valoarea terenurilor in San Francisco", ylab = "Land value", xlab = "An")

# Convert to "real" values by dividing the land value by the land price index.
date_sf$Land_Value_Real <- date_sf$Land_Value / date_sf$Land_Price_Index
# Select the two series by name rather than by position (previously columns
# 10 and 5), so the plot does not silently pick other columns if the layout of
# the data frame changes.
plot.ts(ts(date_sf[, c("Land_Value_Real", "Land_Value")], frequency = 4, start = c(1984, 4)),
        col = "gold",
        main = "Valoarea reala si nominala a terenurilor in San Francisco",
        ylab = "Land value", xlab = "An")

# In nominal terms, land values rose in the run-up to the crisis and then fell
# (the original note is cut off here; presumably it goes on to describe the
# later recovery).
# Find the quarter with the MAXIMUM nominal land value.
which.max(date_sf$Land_Value)
## [1] 86
# Look the maximum up through which.max() and the column name instead of the
# hard-coded row 86 / column 5, so the line stays correct if the data change.
date_sf[which.max(date_sf$Land_Value), "Land_Value"]
## [1] 1159348
# The land value is highest in the first quarter of 2006.
# Quarter with the highest home value.
date_sf[which.max(date_sf$Home_Value), "Date"]
## [1] "2016Q1"
# All observations that belong to the West Coast region.
date_wc <- date[which(date$region == "West Coast"), , drop = FALSE]
# Distinct region labels present in the data.
unique(date[["region"]])
## [1] "Southeast" "East Coast" "Midwest" "Southwest" "West Coast"
# Distinct MSA labels present in the data.
unique(date[["MSA"]])
## [1] "ATLANTA" "BALTIMORE" "BIRMINGHAM"
## [4] "BOSTON" "BUFFALO" "CHARLOTTE"
## [7] "CHICAGO" "CINCINNATI" "CLEVELAND"
## [10] "COLUMBUS" "DALLAS" "DENVER"
## [13] "DETROIT" "FORTWORTH" "HARTFORD"
## [16] "HOUSTON" "INDIANAPOLIS" "KANSASCITY"
## [19] "LOSANGELES" "MEMPHIS" "MIAMI"
## [22] "MILWAUKEE" "MINNEAPOLISSTPAUL" "NEWORLEANS"
## [25] "NEWYORK" "NORFOLK" "OAKLAND"
## [28] "OKLAHOMACITY" "PHILADELPHIA" "PHOENIX"
## [31] "PITTSBURGH" "PORTLAND" "PROVIDENCE"
## [34] "ROCHESTER" "SACRAMENTO" "SALTLAKECITY"
## [37] "SANANTONIO" "SANBERNADINO" "SANDIEGO"
## [40] "SANFRANCISCO" "SANJOSE" "SANTAANA"
## [43] "SEATTLE" "STLOUIS" "TAMPA"
## [46] "WASHINGTONDC"
# Per-column summary statistics for the West Coast subset.
summary(object = date_wc)
## MSA Date Home_Value Structure_Cost
## Length:1260 Length:1260 Min. : 83617 Min. : 41935
## Class :character Class :character 1st Qu.: 210139 1st Qu.: 81177
## Mode :character Mode :character Median : 316152 Median :107946
## Mean : 393883 Mean :124305
## 3rd Qu.: 493409 3rd Qu.:166961
## Max. :1346489 Max. :278519
## Land_Value Land_Share Home_Price_Index Land_Price_Index
## Min. : 19814 Min. :0.08667 Min. :0.3076 Min. :0.1809
## 1st Qu.: 108416 1st Qu.:0.51932 1st Qu.:0.6872 1st Qu.:0.5823
## Median : 194094 Median :0.64916 Median :1.0101 Median :1.0000
## Mean : 269578 Mean :0.62102 Mean :1.1430 Mean :1.1643
## 3rd Qu.: 352515 3rd Qu.:0.74352 3rd Qu.:1.5497 3rd Qu.:1.5817
## Max. :1159348 Max. :0.88936 Max. :2.6569 Max. :4.8386
## region
## Length:1260
## Class :character
## Mode :character
##
##
##
# Find the quarter in which Land_Price_Index reaches its maximum
# (over all West Coast rows).
date_wc$Date[which.max(date_wc$Land_Price_Index)]
## [1] "2006Q2"
# In 2006Q2
# Same question for Home_Price_Index
date_wc$Date[which.max(date_wc$Home_Price_Index)]
## [1] "2006Q2"
# Same quarter
# The range of Home_Value is fairly wide; median < mean
# Chicago subset and its summary statistics.
date_chi <- date[which(date$MSA == "CHICAGO"), , drop = FALSE]
summary(object = date_chi)
## MSA Date Home_Value Structure_Cost
## Length:126 Length:126 Min. : 95518 Min. : 69522
## Class :character Class :character 1st Qu.:153445 1st Qu.: 93099
## Mode :character Mode :character Median :215035 Median :120706
## Mean :211197 Mean :137419
## 3rd Qu.:260425 3rd Qu.:187915
## Max. :346159 Max. :239964
## Land_Value Land_Share Home_Price_Index Land_Price_Index
## Min. : 11119 Min. :0.0500 Min. :0.4483 Min. :0.1192
## 1st Qu.: 38338 1st Qu.:0.2796 1st Qu.:0.7202 1st Qu.:0.3966
## Median : 63358 Median :0.3912 Median :1.0093 Median :0.6728
## Mean : 73778 Mean :0.3480 Mean :0.9913 Mean :0.7895
## 3rd Qu.: 96265 3rd Qu.:0.4287 3rd Qu.:1.2224 3rd Qu.:1.0393
## Max. :173331 Max. :0.5097 Max. :1.6248 Max. :1.9134
## region
## Length:126
## Class :character
## Mode :character
##
##
##
# Land-value plots for Chicago.
# Fix: apart from the first plot, this section used to draw date_sf (the San
# Francisco data), twice under a "Chicago" title. Every plot now uses date_chi
# and carries a Chicago title.
plot(date_chi$Land_Value)

plot(date_chi$Land_Value, col = "red", type = "l", main = "Valoarea terenurilor in Chicago")

ts.plot(date_chi$Land_Value, col = "red", main = "Valoarea terenurilor in Chicago")

# Show the time period on the x axis.
# NOTE(review): start = c(1984, 4) is carried over from the San Francisco
# code; date_chi has the same 126 rows, but confirm its first Date value.
plot.ts(ts(date_chi$Land_Value, frequency = 4, start = c(1984, 4)), col = "red",
        main = "Valoarea terenurilor in Chicago", ylab = "Land value", xlab = "An")

# Convert to "real" values by dividing the land value by the land price index.
date_chi$Land_Value_Real <- date_chi$Land_Value / date_chi$Land_Price_Index
# Columns are selected by name, not by position.
plot.ts(ts(date_chi[, c("Land_Value_Real", "Land_Value")], frequency = 4, start = c(1984, 4)),
        col = "red",
        main = "Valoarea reala si nominala a terenurilor in Chicago",
        ylab = "Land value", xlab = "An")

# All observations that belong to the Midwest region, and their summary.
date_mw <- date[which(date$region == "Midwest"), , drop = FALSE]
summary(object = date_mw)
## MSA Date Home_Value Structure_Cost
## Length:1638 Length:1638 Min. : 55094 Min. : 52339
## Class :character Class :character 1st Qu.:109405 1st Qu.: 83312
## Mode :character Mode :character Median :139278 Median :103359
## Mean :145728 Mean :113002
## 3rd Qu.:174386 3rd Qu.:140738
## Max. :346159 Max. :239964
## Land_Value Land_Share Home_Price_Index Land_Price_Index
## Min. : 2755 Min. :0.0500 Min. :0.3593 Min. :0.0000001
## 1st Qu.: 13172 1st Qu.:0.0992 1st Qu.:0.7310 1st Qu.:0.2560125
## Median : 25498 Median :0.2076 Median :0.9798 Median :0.5760505
## Mean : 32726 Mean :0.2115 Mean :0.9638 Mean :0.6393360
## 3rd Qu.: 43108 3rd Qu.:0.3146 3rd Qu.:1.1660 3rd Qu.:0.9796820
## Max. :173331 Max. :0.5097 Max. :1.6552 Max. :2.3176107
## region
## Length:1638
## Class :character
## Mode :character
##
##
##
# Find the quarter in which Land_Price_Index reaches its maximum
# (over all Midwest rows).
date_mw$Date[which.max(date_mw$Land_Price_Index)]
## [1] "2006Q1"
# In 2006Q1
# Same question for Home_Price_Index
date_mw$Date[which.max(date_mw$Home_Price_Index)]
## [1] "2006Q2"
# In 2006Q2
date_mw$Date[which.max(date_mw$Structure_Cost)]
## [1] "2016Q1"
# The structure cost reaches its maximum in 2016Q1
# Plot the home value of three cities on the same chart: Miami, Washington DC
# and San Francisco.
# The three cities are selected directly inside the ggplot() call.
# Date is stored as text such as "2016Q1". ggplot2 treats text as a discrete
# axis, which yields one overlapping tick label per quarter. It is therefore
# converted, on a temporary copy only, to a numeric value
# (year + (quarter - 1) / 4) so the x axis is continuous and readable.
ggplot(
  transform(
    subset(date, MSA %in% c("MIAMI", "WASHINGTONDC", "SANFRANCISCO")),
    Date = as.numeric(substr(Date, 1, 4)) + (as.numeric(substr(Date, 6, 6)) - 1) / 4
  ),
  aes(x = Date, y = Home_Value, color = MSA)
) +
  geom_point()

# The highest home values are observed in San Francisco (its maximum, 1346489,
# is also the maximum of the whole data set).
# Back to point b - step 3
# Regression
# Step 5 - Provide a regression that expresses the cost of housing in Miami as
# a function of home value. It is estimated once with a linear and once with a
# non-linear (log-log) specification.
# Structure_Cost ~ Home_Value
date_mi <- subset(date, MSA == "MIAMI")
# "aquamarine3" is a fixed colour, so it is set on the layer, outside aes().
# Inside aes() the string was treated as a data value: the points were drawn in
# the default palette colour and a meaningless legend was added.
ggplot(date_mi, aes(x = Home_Value, y = Structure_Cost)) +
  geom_point(colour = "aquamarine3") +
  geom_smooth(method = "lm")
## `geom_smooth()` using formula = 'y ~ x'

# Linear model for Miami: structure cost explained by home value (levels).
model1 <- lm(formula = date_mi$Structure_Cost ~ date_mi$Home_Value)
summary(object = model1)
##
## Call:
## lm(formula = date_mi$Structure_Cost ~ date_mi$Home_Value)
##
## Residuals:
## Min 1Q Median 3Q Max
## -57628 -16249 -7940 5022 56803
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2.356e+04 6.139e+03 3.838 0.000197 ***
## date_mi$Home_Value 3.017e-01 2.243e-02 13.450 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 28590 on 124 degrees of freedom
## Multiple R-squared: 0.5933, Adjusted R-squared: 0.59
## F-statistic: 180.9 on 1 and 124 DF, p-value: < 2.2e-16
# 59.33% of the variation in structure cost is explained by the variation in
# home value.
# When the home value rises by 1 $, the structure cost rises by about 0.30 $
# (3.017e-01).
# The model is valid and the coefficients are statistically significant.
# The marginal effect of home value on cost is positive and below one
# (about 0.30).
# SEE THE LECTURE NOTES, STUDY THE LECTURE NOTES
# Scatter plot of the log-transformed variables for Miami.
ggplot(data = date_mi, mapping = aes(x = log(Home_Value), y = log(Structure_Cost))) +
  geom_point(colour = "lightblue")

# Log-log model for Miami: the slope is the elasticity of cost with respect
# to home value.
model2 <- lm(formula = log(date_mi$Structure_Cost) ~ log(date_mi$Home_Value))
summary(object = model2)
##
## Call:
## lm(formula = log(date_mi$Structure_Cost) ~ log(date_mi$Home_Value))
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.45296 -0.13590 -0.01300 0.08752 0.43916
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -0.25280 0.55484 -0.456 0.649
## log(date_mi$Home_Value) 0.94477 0.04497 21.010 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.2147 on 124 degrees of freedom
## Multiple R-squared: 0.7807, Adjusted R-squared: 0.7789
## F-statistic: 441.4 on 1 and 124 DF, p-value: < 2.2e-16
# The model is valid; the intercept is not statistically significant.
# R^2 is higher than in the linear model (0.7807 vs 0.5933), which these notes
# read as a better model.
# NOTE(review): the two models have different dependent variables (level vs
# log), so their R^2 values are not strictly comparable.
# 78.07% of the variation in log cost is explained by the variation in log
# home value.
# 0.94477 is the elasticity of cost with respect to home value.
E_cost <- coef(model2)[2]
E_cost
## log(date_mi$Home_Value)
## 0.9447732
# When home values rise by 1%, the long-run cost of housing rises by about
# 0.945%.
# The value lies in (0, 1).
# Elasticity < 1 means marginal cost < average cost.
# Zone of economic efficiency
# Because the elasticity is < 1, house building in Miami shows economies of
# scale, so the activity lies in a zone of economic efficiency.
# Otherwise => diseconomies of scale
# NOTE(review): with a standard error of 0.04497 the estimate is only about
# 1.2 standard errors below 1, so "elasticity < 1" is not statistically
# established - confirm before drawing the conclusion above.
# The model for my own city (Chicago)
# "aquamarine3" is a fixed colour, so it is set on the layer, outside aes().
# Inside aes() the string was treated as a data value: the points were drawn in
# the default palette colour and a meaningless legend was added.
ggplot(date_chi, aes(x = Home_Value, y = Structure_Cost)) +
  geom_point(colour = "aquamarine3") +
  geom_smooth(method = "lm")
## `geom_smooth()` using formula = 'y ~ x'

# Linear model for Chicago: structure cost explained by home value (levels).
m1 <- lm(formula = date_chi$Structure_Cost ~ date_chi$Home_Value)
summary(object = m1)
##
## Call:
## lm(formula = date_chi$Structure_Cost ~ date_chi$Home_Value)
##
## Residuals:
## Min 1Q Median 3Q Max
## -47502 -16334 -9395 1149 70201
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.134e+04 9.667e+03 1.173 0.243
## date_chi$Home_Value 5.970e-01 4.357e-02 13.702 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 33250 on 124 degrees of freedom
## Multiple R-squared: 0.6022, Adjusted R-squared: 0.599
## F-statistic: 187.8 on 1 and 124 DF, p-value: < 2.2e-16
# Scatter plot of the log-transformed variables for Chicago.
ggplot(data = date_chi, mapping = aes(x = log(Home_Value), y = log(Structure_Cost))) +
  geom_point(colour = "lightblue")

# Log-log model for Chicago.
m2 <- lm(formula = log(date_chi$Structure_Cost) ~ log(date_chi$Home_Value))
summary(object = m2)
##
## Call:
## lm(formula = log(date_chi$Structure_Cost) ~ log(date_chi$Home_Value))
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.24910 -0.12245 -0.05806 0.09441 0.39835
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -0.03949 0.60118 -0.066 0.948
## log(date_chi$Home_Value) 0.96669 0.04924 19.633 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.1888 on 124 degrees of freedom
## Multiple R-squared: 0.7566, Adjusted R-squared: 0.7546
## F-statistic: 385.5 on 1 and 124 DF, p-value: < 2.2e-16
# PENTRU DATA VIITOARE (TEMA)
# SA FACEM MODELE DE REGRESIE LINIARE SI NELINIARE PENTRU 3 ORASE
# INTERPRETARI GRAFICE, BOXPLOT-URI, HISTOGRAME, SK, KURT, ANALIZA DE REZIDUURI (ANALIZA CA AZI)
# A SE VEDEA EXPLICATIILE DE LA CURS
# Descriptive statistics for Miami: five-number summaries, standard
# deviations and coefficients of variation (sd / mean).
summary(date_mi[["Home_Value"]])
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 126865 153776 199783 249010 314385 538482
summary(date_mi[["Structure_Cost"]])
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 42553 59543 82511 98690 143387 187085
sd_home_mi <- sd(date_mi[["Home_Value"]], na.rm = TRUE)
sd_home_mi
## [1] 113987
sd_cost_mi <- sd(date_mi[["Structure_Cost"]], na.rm = TRUE)
sd_cost_mi
## [1] 44648.79
sd_home_mi / mean(date_mi[["Home_Value"]], na.rm = TRUE)
## [1] 0.4577617
sd_cost_mi / mean(date_mi[["Structure_Cost"]], na.rm = TRUE)
## [1] 0.4524129
# Both coefficients of variation (about 0.458 and 0.452) are ABOVE the 0.35
# (35%) threshold, so by that rule the series are not homogeneous and the
# means are not representative.
library(ggplot2)
library(moments)
# Shape of the Miami distributions (skewness and kurtosis from `moments`).
moments::skewness(date_mi[["Home_Value"]], na.rm = TRUE)
## [1] 0.9689774
moments::skewness(date_mi[["Structure_Cost"]], na.rm = TRUE)
## [1] 0.5030657
# Skewness > 0 => both distributions are skewed to the right (positive skew)
moments::kurtosis(date_mi[["Home_Value"]], na.rm = TRUE)
## [1] 3.008205
moments::kurtosis(date_mi[["Structure_Cost"]], na.rm = TRUE)
## [1] 1.821421
# Only Structure_Cost (1.82) is clearly below 3, i.e. platykurtic (flattened).
# Home_Value (3.008) is practically equal to 3, i.e. roughly mesokurtic.
# Distribution plots for Miami, arranged on a 2 x 2 grid:
# histogram and boxplot for home value, then for structure cost.
par(mfrow = c(2, 2))
hist(date_mi$Home_Value,
     main = "Histograma valoare casa - Miami",
     xlab = "Valoare casa",
     col = "royalblue")
boxplot(date_mi$Home_Value,
        main = "Boxplot valoare casa - Miami",
        horizontal = TRUE,
        col = "royalblue")
hist(date_mi$Structure_Cost,
     main = "Histograma cost structura - Miami",
     xlab = "Cost structura",
     col = "salmon")
boxplot(date_mi$Structure_Cost,
        main = "Boxplot cost structura - Miami",
        horizontal = TRUE,
        col = "salmon")

# Reading of the plots (from the original notes): the boxplots show no visible
# outliers; the distributions look fairly stable, with a slight asymmetry.
# Linear and non-linear regression for Miami
# Linear specification (same formula as model1 earlier in the script).
m1_mi <- lm(formula = date_mi$Structure_Cost ~ date_mi$Home_Value)
summary(object = m1_mi)
##
## Call:
## lm(formula = date_mi$Structure_Cost ~ date_mi$Home_Value)
##
## Residuals:
## Min 1Q Median 3Q Max
## -57628 -16249 -7940 5022 56803
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2.356e+04 6.139e+03 3.838 0.000197 ***
## date_mi$Home_Value 3.017e-01 2.243e-02 13.450 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 28590 on 124 degrees of freedom
## Multiple R-squared: 0.5933, Adjusted R-squared: 0.59
## F-statistic: 180.9 on 1 and 124 DF, p-value: < 2.2e-16
# R-squared is 0.5933: 59.33% of the variation in structure cost is explained
# by home value.
# The p-values of the coefficients are very small (< 0.05), so the model and
# its variables are statistically significant.
# When the home value rises by 1 unit, the structure cost rises by about 0.30
# (3.017e-01).
# Log-log specification.
m2_mi <- lm(formula = log(date_mi$Structure_Cost) ~ log(date_mi$Home_Value))
summary(object = m2_mi)
##
## Call:
## lm(formula = log(date_mi$Structure_Cost) ~ log(date_mi$Home_Value))
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.45296 -0.13590 -0.01300 0.08752 0.43916
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -0.25280 0.55484 -0.456 0.649
## log(date_mi$Home_Value) 0.94477 0.04497 21.010 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.2147 on 124 degrees of freedom
## Multiple R-squared: 0.7807, Adjusted R-squared: 0.7789
## F-statistic: 441.4 on 1 and 124 DF, p-value: < 2.2e-16
# R-squared rises to 0.7807 (78.07%), which these notes read as the log-log
# model being better than the linear one.
# The elasticity of cost with respect to home value is 0.945.
# In a log-log model the slope is an elasticity: when the home value rises by
# 1%, the structure cost rises by about 0.945% (not by 0.945 units).
# Plot of the linear regression for Miami
par(mfrow = c(1, 1))
plot(date_mi$Home_Value, date_mi$Structure_Cost,
     main = "Regresie liniara Miami",
     xlab = "Valoare casa",
     ylab = "Cost structura",
     col = "blue")
abline(m1_mi, lwd = 2, col = "red")

# Residual analysis for the Miami log-log model
erori_mi <- residuals(m2_mi)
mean(erori_mi)
## [1] -1.934376e-17
sd(erori_mi)
## [1] 0.2138231
moments::skewness(erori_mi)
## [1] 0.1185884
moments::kurtosis(erori_mi)
## [1] 2.567293
# The mean of the residuals is practically 0 (order e-17); for a least-squares
# fit with an intercept this holds by construction.
# The skewness of the residuals is close to 0, so their distribution is
# fairly symmetric.
par(mfrow = c(1, 2))
hist(erori_mi,
     main = "Hist. reziduuri - Miami",
     xlab = "Reziduuri",
     col = "grey")
boxplot(erori_mi,
        main = "Boxplot reziduuri - Miami",
        horizontal = TRUE,
        col = "grey")

# Descriptive statistics for Chicago
summary(date_chi[["Home_Value"]])
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 95518 153445 215035 211197 260425 346159
summary(date_chi[["Structure_Cost"]])
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 69522 93099 120706 137419 187915 239964
sd_home_chi <- sd(date_chi[["Home_Value"]], na.rm = TRUE)
sd_cost_chi <- sd(date_chi[["Structure_Cost"]], na.rm = TRUE)
# Coefficients of variation (sd / mean).
sd_home_chi / mean(date_chi[["Home_Value"]], na.rm = TRUE)
## [1] 0.3232225
sd_cost_chi / mean(date_chi[["Structure_Cost"]], na.rm = TRUE)
## [1] 0.3821368
# Only the home-value coefficient (0.323) is below 0.35 (35%); the
# structure-cost coefficient (0.382) is above it. By that rule the mean home
# value is representative for Chicago, while the mean structure cost is not.
# Shape of the Chicago distributions (skewness and kurtosis from `moments`).
moments::skewness(date_chi[["Home_Value"]], na.rm = TRUE)
## [1] 0.1746737
moments::skewness(date_chi[["Structure_Cost"]], na.rm = TRUE)
## [1] 0.4906416
moments::kurtosis(date_chi[["Home_Value"]], na.rm = TRUE)
## [1] 2.032091
moments::kurtosis(date_chi[["Structure_Cost"]], na.rm = TRUE)
## [1] 1.872484
# Both values are < 3, so the distributions are platykurtic
# Distribution plots for Chicago, arranged on a 2 x 2 grid:
# histogram and boxplot for home value, then for structure cost.
par(mfrow = c(2, 2))
hist(date_chi$Home_Value,
     main = "Histograma valoare casa - Chicago",
     xlab = "Valoare casa",
     col = "seagreen")
boxplot(date_chi$Home_Value,
        main = "Boxplot valoare casa - Chicago",
        horizontal = TRUE,
        col = "seagreen")
hist(date_chi$Structure_Cost,
     main = "Histograma cost structura - Chicago",
     xlab = "Cost structura",
     col = "orange")
boxplot(date_chi$Structure_Cost,
        main = "Boxplot cost structura - Chicago",
        horizontal = TRUE,
        col = "orange")

# Reading of the plots (from the original notes): the boxplots show no
# important outliers for cost or value.
# Linear and non-linear regression for Chicago
# Linear specification (same formula as m1 earlier in the script).
m1_chi <- lm(formula = date_chi$Structure_Cost ~ date_chi$Home_Value)
summary(object = m1_chi)
##
## Call:
## lm(formula = date_chi$Structure_Cost ~ date_chi$Home_Value)
##
## Residuals:
## Min 1Q Median 3Q Max
## -47502 -16334 -9395 1149 70201
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.134e+04 9.667e+03 1.173 0.243
## date_chi$Home_Value 5.970e-01 4.357e-02 13.702 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 33250 on 124 degrees of freedom
## Multiple R-squared: 0.6022, Adjusted R-squared: 0.599
## F-statistic: 187.8 on 1 and 124 DF, p-value: < 2.2e-16
# R-squared is 0.6022: 60.22% of the variation in structure cost is explained
# by home value.
# When the home value rises by 1 unit, the structure cost rises by about 0.597
# (5.970e-01).
# Log-log specification.
m2_chi <- lm(formula = log(date_chi$Structure_Cost) ~ log(date_chi$Home_Value))
summary(object = m2_chi)
##
## Call:
## lm(formula = log(date_chi$Structure_Cost) ~ log(date_chi$Home_Value))
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.24910 -0.12245 -0.05806 0.09441 0.39835
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -0.03949 0.60118 -0.066 0.948
## log(date_chi$Home_Value) 0.96669 0.04924 19.633 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.1888 on 124 degrees of freedom
## Multiple R-squared: 0.7566, Adjusted R-squared: 0.7546
## F-statistic: 385.5 on 1 and 124 DF, p-value: < 2.2e-16
# R-squared is 0.7566: 75.66% of the variation in log structure cost is
# explained by log home value.
# In a log-log model the slope is an elasticity: when the home value rises by
# 1%, the structure cost rises by about 0.967% (not by 0.967 units).
# Plot of the linear regression for Chicago
par(mfrow = c(1, 1))
plot(date_chi$Home_Value, date_chi$Structure_Cost,
     main = "Regresie liniara Chicago",
     xlab = "Valoare casa",
     ylab = "Cost structura",
     col = "darkgreen")
abline(m1_chi, lwd = 2, col = "red")

# Residual analysis for the Chicago log-log model
erori_chi <- residuals(m2_chi)
mean(erori_chi)
## [1] -2.89482e-17
sd(erori_chi)
## [1] 0.1880798
moments::skewness(erori_chi)
## [1] 0.802925
moments::kurtosis(erori_chi)
## [1] 2.486434
# The mean of the residuals is practically 0
# The skewness (about 0.80) shows a noticeable right skew, so the residuals
# are NOT close to symmetric here.
par(mfrow = c(1, 2))
hist(erori_chi,
     main = "Hist. reziduuri - Chicago",
     xlab = "Reziduuri",
     col = "grey")
boxplot(erori_chi,
        main = "Boxplot reziduuri - Chicago",
        horizontal = TRUE,
        col = "grey")

# Descriptive statistics for San Francisco
summary(date_sf[["Home_Value"]])
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 235636 406116 694572 692200 909208 1346489
summary(date_sf[["Structure_Cost"]])
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 41935 73330 104522 122310 177796 257243
sd_home_sf <- sd(date_sf[["Home_Value"]], na.rm = TRUE)
sd_cost_sf <- sd(date_sf[["Structure_Cost"]], na.rm = TRUE)
# Coefficients of variation (sd / mean).
sd_home_sf / mean(date_sf[["Home_Value"]], na.rm = TRUE)
## [1] 0.4802773
sd_cost_sf / mean(date_sf[["Structure_Cost"]], na.rm = TRUE)
## [1] 0.5025894
# Both coefficients (about 0.48 and 0.50) are ABOVE 35%, so by that rule the
# series are not homogeneous and the means are not representative.
# Shape of the San Francisco distributions (skewness and kurtosis from
# `moments`).
moments::skewness(date_sf[["Home_Value"]], na.rm = TRUE)
## [1] 0.4319539
moments::skewness(date_sf[["Structure_Cost"]], na.rm = TRUE)
## [1] 0.6024737
# Both distributions are skewed to the right
moments::kurtosis(date_sf[["Home_Value"]], na.rm = TRUE)
## [1] 1.892514
moments::kurtosis(date_sf[["Structure_Cost"]], na.rm = TRUE)
## [1] 2.135844
# Values < 3 indicate flattened (platykurtic) distributions
# Distribution plots for San Francisco, arranged on a 2 x 2 grid:
# histogram and boxplot for home value, then for structure cost.
par(mfrow = c(2, 2))
hist(date_sf$Home_Value,
     main = "Histograma valoare casa - San Francisco",
     xlab = "Valoare casa",
     col = "purple")
boxplot(date_sf$Home_Value,
        main = "Boxplot valoare casa - San Francisco",
        horizontal = TRUE,
        col = "purple")
hist(date_sf$Structure_Cost,
     main = "Histograma cost structura - San Francisco",
     xlab = "Cost structura",
     col = "pink")
boxplot(date_sf$Structure_Cost,
        main = "Boxplot cost structura - San Francisco",
        horizontal = TRUE,
        col = "pink")

# Reading of the plots (from the original notes): the boxplots show no
# outliers.
# NOTE(review): the original note also calls the distributions "stable over
# time", which a boxplot cannot show - confirm against the time-series plots.
# Linear and non-linear regression for San Francisco
# Linear specification: structure cost explained by home value (levels).
m1_sf <- lm(formula = date_sf$Structure_Cost ~ date_sf$Home_Value)
summary(object = m1_sf)
##
## Call:
## lm(formula = date_sf$Structure_Cost ~ date_sf$Home_Value)
##
## Residuals:
## Min 1Q Median 3Q Max
## -71698 -17403 -6231 11949 73480
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.632e+04 7.142e+03 2.285 0.024 *
## date_sf$Home_Value 1.531e-01 9.308e-03 16.451 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 34600 on 124 degrees of freedom
## Multiple R-squared: 0.6858, Adjusted R-squared: 0.6832
## F-statistic: 270.6 on 1 and 124 DF, p-value: < 2.2e-16
# R-squared is 0.6858: 68.58% of the variation in structure cost is explained
# by home value.
# The p-value is very small (< 0.05), so the linear relationship is
# statistically significant.
# When the home value rises by 1 unit, the structure cost rises by about 0.153
# (1.531e-01).
# Log-log specification.
m2_sf <- lm(formula = log(date_sf$Structure_Cost) ~ log(date_sf$Home_Value))
summary(object = m2_sf)
##
## Call:
## lm(formula = log(date_sf$Structure_Cost) ~ log(date_sf$Home_Value))
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.40725 -0.15074 -0.02867 0.12825 0.44538
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -0.77721 0.50104 -1.551 0.123
## log(date_sf$Home_Value) 0.92777 0.03757 24.693 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.2123 on 124 degrees of freedom
## Multiple R-squared: 0.831, Adjusted R-squared: 0.8296
## F-statistic: 609.8 on 1 and 124 DF, p-value: < 2.2e-16
# R-squared is 0.831: 83.1% of the variation in log structure cost is
# explained by log home value.
# The p-value is very small (< 0.05), so the log-log relationship is
# statistically significant.
# In a log-log model the slope is an elasticity: when the home value rises by
# 1%, the structure cost rises by about 0.928% (not by 0.928 units).
# Plot of the linear regression for San Francisco
par(mfrow = c(1, 1))
plot(date_sf$Home_Value, date_sf$Structure_Cost,
     main = "Regresie liniara San Francisco",
     xlab = "Valoare casa",
     ylab = "Cost structura",
     col = "darkblue")
abline(m1_sf, lwd = 2, col = "red")

# Residual analysis for the San Francisco log-log model
erori_sf <- residuals(m2_sf)
mean(erori_sf)
## [1] -1.29631e-17
sd(erori_sf)
## [1] 0.2114198
moments::skewness(erori_sf)
## [1] 0.1833915
moments::kurtosis(erori_sf)
## [1] 2.382807
# The mean of the residuals is practically 0; for a least-squares fit with an
# intercept this holds by construction.
# The skewness is close to 0 and the kurtosis is below 3, so the residuals
# look roughly symmetric and without heavy tails.
par(mfrow = c(1, 2))
hist(erori_sf,
     main = "Hist. reziduuri - San Francisco",
     xlab = "Reziduuri",
     col = "grey")
boxplot(erori_sf,
        main = "Boxplot reziduuri - San Francisco",
        horizontal = TRUE,
        col = "grey")
