# Load the seminar dataset. The data form a panel: one row per metropolitan
# area (MSA) and quarter (Date).
# header = TRUE and sep = "," are the defaults of read.csv(), so they are
# not repeated here.
date <- read.csv(file = "date_seminar_costuri_regiune.csv")

# One-time setup, if the package is missing: install.packages("ggplot2")
library(ggplot2)

# Descriptive statistics for every column of the full panel.
summary(object = date)
##      MSA                Date             Home_Value      Structure_Cost  
##  Length:5796        Length:5796        Min.   :  55094   Min.   : 34894  
##  Class :character   Class :character   1st Qu.: 123809   1st Qu.: 78809  
##  Mode  :character   Mode  :character   Median : 170911   Median :102434  
##                                        Mean   : 223947   Mean   :115065  
##                                        3rd Qu.: 256604   3rd Qu.:145661  
##                                        Max.   :1346489   Max.   :289896  
##    Land_Value        Land_Share     Home_Price_Index Land_Price_Index
##  Min.   :   2755   Min.   :0.0500   Min.   :0.3076   Min.   :0.0000  
##  1st Qu.:  25260   1st Qu.:0.2107   1st Qu.:0.7522   1st Qu.:0.4819  
##  Median :  58408   Median :0.3544   Median :0.9971   Median :0.7812  
##  Mean   : 108882   Mean   :0.3797   Mean   :1.0645   Mean   :0.9060  
##  3rd Qu.: 122911   3rd Qu.:0.5427   3rd Qu.:1.3000   3rd Qu.:1.1788  
##  Max.   :1159348   Max.   :0.8894   Max.   :2.7172   Max.   :4.8386  
##     region         
##  Length:5796       
##  Class :character  
##  Mode  :character  
##                    
##                    
## 
# Number of rows and columns of the full panel.
dim(date)
## [1] 5796    9
# Land values, home values and the price indices show a wide gap between
# their minimum and maximum. Keep in mind that the data cover roughly 30 years.

# Keep only the San Francisco rows (126 quarters between 1984 and 2016).
# which() drops NA comparisons, exactly as subset() does.
date_sf <- date[which(date$MSA == "SANFRANCISCO"), , drop = FALSE]

# a) descriptive statistics
summary(object = date_sf)
##      MSA                Date             Home_Value      Structure_Cost  
##  Length:126         Length:126         Min.   : 235636   Min.   : 41935  
##  Class :character   Class :character   1st Qu.: 406116   1st Qu.: 73330  
##  Mode  :character   Mode  :character   Median : 694572   Median :104522  
##                                        Mean   : 692200   Mean   :122310  
##                                        3rd Qu.: 909208   3rd Qu.:177796  
##                                        Max.   :1346489   Max.   :257243  
##    Land_Value        Land_Share     Home_Price_Index Land_Price_Index
##  Min.   : 193701   Min.   :0.7304   Min.   :0.3472   Min.   :0.2987  
##  1st Qu.: 330138   1st Qu.:0.7973   1st Qu.:0.5985   1st Qu.:0.5476  
##  Median : 543178   Median :0.8211   Median :1.0236   Median :0.9750  
##  Mean   : 569889   Mean   :0.8204   Mean   :1.0201   Mean   :0.9993  
##  3rd Qu.: 755749   3rd Qu.:0.8495   3rd Qu.:1.3399   3rd Qu.:1.3304  
##  Max.   :1159348   Max.   :0.8894   Max.   :1.9843   Max.   :2.0387  
##     region         
##  Length:126        
##  Class :character  
##  Mode  :character  
##                    
##                    
## 
# Avem diferenta mare intre indicii de pret minim si maxim atat ai valorii locuintelor, cat si
# ai terenurilor
# La valoarea terenului maximul este de 203% si minimul 29%
# Avem amplitudini mari pentru home_value si land_value

# b) plot the land value series for San Francisco
# The first argument is kept as `date_sf$Land_Value` because the default axis
# label is derived from that expression.
plot(date_sf$Land_Value, type = "l", col = "gold",
     main = "Valoarea terenurilor in San Francisco")

ts.plot(date_sf$Land_Value, main = "Valoarea terenurilor in San Francisco",
        col = "gold")

# Show the time period on the x axis: quarterly series starting in 1984 Q4.
plot.ts(
  ts(data = date_sf$Land_Value, start = c(1984, 4), frequency = 4),
  main = "Valoarea terenurilor in San Francisco",
  xlab = "An", ylab = "Land value", col = "gold"
)

# Convert to real values: nominal land value divided by the land price index.
date_sf$Land_Value_Real <- with(date_sf, Land_Value / Land_Price_Index)
# Real and nominal series, selected by column name rather than by position.
plot.ts(
  ts(data = date_sf[, c("Land_Value_Real", "Land_Value")],
     start = c(1984, 4), frequency = 4),
  main = "Valoarea reala si nominala a terenurilor in San Francisco",
  xlab = "An", ylab = "Land value", col = "gold"
)

# In nominal terms, land values rose in the run-up to the crisis and then fell
# (the original note is unfinished; presumably they recovered afterwards).
# Identify the quarter in which the nominal land value reached its maximum.
# The row index is stored and reused, and the column is addressed by name,
# instead of hard-coding row 86 and column 5 copied from earlier output.
idx_max_land <- which.max(date_sf$Land_Value)
idx_max_land
## [1] 86
date_sf$Land_Value[idx_max_land]
## [1] 1159348
# Quarter of the maximum land value
# (the original notes report 2006, first quarter).
date_sf[idx_max_land, "Date"]
# Quarter in which the home value is highest.
date_sf[which.max(date_sf$Home_Value), "Date"]
## [1] "2016Q1"
## [1] "2016Q1"
# West Coast subset; which() drops NA comparisons, exactly as subset() does.
date_wc <- date[which(date$region == "West Coast"), , drop = FALSE]
# Distinct regions present in the panel.
unique(date[["region"]])
## [1] "Southeast"  "East Coast" "Midwest"    "Southwest"  "West Coast"
# Distinct metropolitan areas present in the panel.
unique(date[["MSA"]])
##  [1] "ATLANTA"           "BALTIMORE"         "BIRMINGHAM"       
##  [4] "BOSTON"            "BUFFALO"           "CHARLOTTE"        
##  [7] "CHICAGO"           "CINCINNATI"        "CLEVELAND"        
## [10] "COLUMBUS"          "DALLAS"            "DENVER"           
## [13] "DETROIT"           "FORTWORTH"         "HARTFORD"         
## [16] "HOUSTON"           "INDIANAPOLIS"      "KANSASCITY"       
## [19] "LOSANGELES"        "MEMPHIS"           "MIAMI"            
## [22] "MILWAUKEE"         "MINNEAPOLISSTPAUL" "NEWORLEANS"       
## [25] "NEWYORK"           "NORFOLK"           "OAKLAND"          
## [28] "OKLAHOMACITY"      "PHILADELPHIA"      "PHOENIX"          
## [31] "PITTSBURGH"        "PORTLAND"          "PROVIDENCE"       
## [34] "ROCHESTER"         "SACRAMENTO"        "SALTLAKECITY"     
## [37] "SANANTONIO"        "SANBERNADINO"      "SANDIEGO"         
## [40] "SANFRANCISCO"      "SANJOSE"           "SANTAANA"         
## [43] "SEATTLE"           "STLOUIS"           "TAMPA"            
## [46] "WASHINGTONDC"
# Descriptive statistics for every column of the West Coast subset.
summary(date_wc)
##      MSA                Date             Home_Value      Structure_Cost  
##  Length:1260        Length:1260        Min.   :  83617   Min.   : 41935  
##  Class :character   Class :character   1st Qu.: 210139   1st Qu.: 81177  
##  Mode  :character   Mode  :character   Median : 316152   Median :107946  
##                                        Mean   : 393883   Mean   :124305  
##                                        3rd Qu.: 493409   3rd Qu.:166961  
##                                        Max.   :1346489   Max.   :278519  
##    Land_Value        Land_Share      Home_Price_Index Land_Price_Index
##  Min.   :  19814   Min.   :0.08667   Min.   :0.3076   Min.   :0.1809  
##  1st Qu.: 108416   1st Qu.:0.51932   1st Qu.:0.6872   1st Qu.:0.5823  
##  Median : 194094   Median :0.64916   Median :1.0101   Median :1.0000  
##  Mean   : 269578   Mean   :0.62102   Mean   :1.1430   Mean   :1.1643  
##  3rd Qu.: 352515   3rd Qu.:0.74352   3rd Qu.:1.5497   3rd Qu.:1.5817  
##  Max.   :1159348   Max.   :0.88936   Max.   :2.6569   Max.   :4.8386  
##     region         
##  Length:1260       
##  Class :character  
##  Mode  :character  
##                    
##                    
## 
# Quarter in which Land_Price_Index is highest
# (across all West Coast rows, i.e. all its metropolitan areas).
date_wc$Date[which.max(date_wc$Land_Price_Index)]
## [1] "2006Q2"
# Answer: 2006Q2

# Same question for Home_Price_Index.
date_wc$Date[which.max(date_wc$Home_Price_Index)]
## [1] "2006Q2"
# Same quarter.

# Amplitudinea la home_value este destul de mare, mediana < media


# Chicago subset; which() drops NA comparisons, exactly as subset() does.
date_chi <- date[which(date$MSA == "CHICAGO"), , drop = FALSE]
# Descriptive statistics for every column of the Chicago subset.
summary(object = date_chi)
##      MSA                Date             Home_Value     Structure_Cost  
##  Length:126         Length:126         Min.   : 95518   Min.   : 69522  
##  Class :character   Class :character   1st Qu.:153445   1st Qu.: 93099  
##  Mode  :character   Mode  :character   Median :215035   Median :120706  
##                                        Mean   :211197   Mean   :137419  
##                                        3rd Qu.:260425   3rd Qu.:187915  
##                                        Max.   :346159   Max.   :239964  
##    Land_Value       Land_Share     Home_Price_Index Land_Price_Index
##  Min.   : 11119   Min.   :0.0500   Min.   :0.4483   Min.   :0.1192  
##  1st Qu.: 38338   1st Qu.:0.2796   1st Qu.:0.7202   1st Qu.:0.3966  
##  Median : 63358   Median :0.3912   Median :1.0093   Median :0.6728  
##  Mean   : 73778   Mean   :0.3480   Mean   :0.9913   Mean   :0.7895  
##  3rd Qu.: 96265   3rd Qu.:0.4287   3rd Qu.:1.2224   3rd Qu.:1.0393  
##  Max.   :173331   Max.   :0.5097   Max.   :1.6248   Max.   :1.9134  
##     region         
##  Length:126        
##  Class :character  
##  Mode  :character  
##                    
##                    
## 
# Land value plots for Chicago.
# The original section was copied from the San Francisco one and still plotted
# date_sf under Chicago titles, then redrew the San Francisco charts. Every
# call now uses date_chi and a Chicago title.
plot(date_chi$Land_Value)

plot(date_chi$Land_Value, col = "red", type = "l",
     main = "Valoarea terenurilor in Chicago")

ts.plot(date_chi$Land_Value, col = "red",
        main = "Valoarea terenurilor in Chicago")

# Show the time period on the x axis.
# Assumes Chicago starts in 1984 Q4 like San Francisco (both subsets have
# 126 rows) - confirm against the Date column.
plot.ts(ts(date_chi$Land_Value, frequency = 4, start = c(1984, 4)), col = "red",
        main = "Valoarea terenurilor in Chicago", ylab = "Land value", xlab = "An")

# Convert to real values: nominal land value divided by the land price index.
date_chi$Land_Value_Real <- date_chi$Land_Value / date_chi$Land_Price_Index
# Columns are selected by name so the plot does not depend on column positions.
plot.ts(ts(date_chi[, c("Land_Value_Real", "Land_Value")],
           frequency = 4, start = c(1984, 4)), col = "red",
        main = "Valoarea reala si nominala a terenurilor in Chicago",
        ylab = "Land value", xlab = "An")

# Midwest subset; which() drops NA comparisons, exactly as subset() does.
date_mw <- date[which(date$region == "Midwest"), , drop = FALSE]

# Descriptive statistics for every column of the Midwest subset.
summary(object = date_mw)
##      MSA                Date             Home_Value     Structure_Cost  
##  Length:1638        Length:1638        Min.   : 55094   Min.   : 52339  
##  Class :character   Class :character   1st Qu.:109405   1st Qu.: 83312  
##  Mode  :character   Mode  :character   Median :139278   Median :103359  
##                                        Mean   :145728   Mean   :113002  
##                                        3rd Qu.:174386   3rd Qu.:140738  
##                                        Max.   :346159   Max.   :239964  
##    Land_Value       Land_Share     Home_Price_Index Land_Price_Index   
##  Min.   :  2755   Min.   :0.0500   Min.   :0.3593   Min.   :0.0000001  
##  1st Qu.: 13172   1st Qu.:0.0992   1st Qu.:0.7310   1st Qu.:0.2560125  
##  Median : 25498   Median :0.2076   Median :0.9798   Median :0.5760505  
##  Mean   : 32726   Mean   :0.2115   Mean   :0.9638   Mean   :0.6393360  
##  3rd Qu.: 43108   3rd Qu.:0.3146   3rd Qu.:1.1660   3rd Qu.:0.9796820  
##  Max.   :173331   Max.   :0.5097   Max.   :1.6552   Max.   :2.3176107  
##     region         
##  Length:1638       
##  Class :character  
##  Mode  :character  
##                    
##                    
## 
# Quarter in which Land_Price_Index is highest (across all Midwest rows).
date_mw[["Date"]][which.max(date_mw[["Land_Price_Index"]])]
## [1] "2006Q1"
# Answer: 2006Q1

# Same question for Home_Price_Index.
date_mw[["Date"]][which.max(date_mw[["Home_Price_Index"]])]
## [1] "2006Q2"
# Answer: 2006Q2
date_mw[["Date"]][which.max(date_mw[["Structure_Cost"]])]
## [1] "2016Q1"
# The structure cost reaches its maximum in 2016Q1.

# Plot home values for three cities on the same chart: Miami, Washington and
# San Francisco. The three cities are selected directly inside ggplot().
# Date is stored as text such as "2016Q1"; mapped to x as-is it becomes a
# discrete axis with one overlapping label per quarter. It is converted to a
# numeric year (Q1 = .00, Q2 = .25, ...) so the axis is continuous and each
# city can be drawn as a line.
# Assumes every Date has the form YYYYQn, as in the values printed above.
ggplot(
  transform(
    subset(date, MSA %in% c("MIAMI", "WASHINGTONDC", "SANFRANCISCO")),
    Time = as.numeric(substr(Date, 1, 4)) +
      (as.numeric(substr(Date, 6, 6)) - 1) / 4
  ),
  aes(x = Time, y = Home_Value, color = MSA)
) +
  geom_line() +
  geom_point(size = 0.8) +
  labs(x = "Date")

# The highest home values are observed in San Francisco: its maximum
# (1346489) is also the maximum of the whole dataset.


# Se revine la punctul b - pasul 3

# Regresie
# Pas 5 - Furnizati o regresie care sa exprime costul total al locuintelor din orasul Miami in 
# functie de valoarea locuintelor. Estimam cu o functie de regresie liniara si una neliniara

# Structure_Cost ~ Home_Value
date_mi <- subset(date, MSA == "MIAMI")
# Scatter plot with a fitted regression line.
# A constant colour must be set outside aes(): inside aes() the string
# "aquamarine3" is treated as a data value, which draws a default palette
# colour and adds a meaningless legend.
ggplot(date_mi, aes(x = Home_Value, y = Structure_Cost)) +
  geom_point(col = "aquamarine3") +
  geom_smooth(method = "lm", col = "aquamarine3")
## `geom_smooth()` using formula = 'y ~ x'

# Linear model in levels: structure cost explained by home value.
model1 <- lm(date_mi$Structure_Cost ~ date_mi$Home_Value)
summary(model1)
## 
## Call:
## lm(formula = date_mi$Structure_Cost ~ date_mi$Home_Value)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -57628 -16249  -7940   5022  56803 
## 
## Coefficients:
##                     Estimate Std. Error t value Pr(>|t|)    
## (Intercept)        2.356e+04  6.139e+03   3.838 0.000197 ***
## date_mi$Home_Value 3.017e-01  2.243e-02  13.450  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 28590 on 124 degrees of freedom
## Multiple R-squared:  0.5933, Adjusted R-squared:   0.59 
## F-statistic: 180.9 on 1 and 124 DF,  p-value: < 2.2e-16
# 59.33% of the variation in structure cost is explained by the variation in home value.
# A $1 increase in home value is associated with an increase of about $0.3017 (3.017e-01) in structure cost.
# The model is valid (F-test p-value < 2.2e-16) and both coefficients are statistically significant.
# The marginal effect of home value on structure cost is positive and small (about 0.30 per dollar).

# A SE VEDEA CURSUL, A SE INVATA CURSUL

# Scatter plot of the two variables on the log scale.
ggplot(
  data = date_mi,
  mapping = aes(x = log(Home_Value), y = log(Structure_Cost))
) +
  geom_point(colour = "lightblue")

# Log-log model: the slope is the elasticity of structure cost with respect
# to home value.
model2 <- lm(formula = log(date_mi$Structure_Cost) ~ log(date_mi$Home_Value))
summary(object = model2)
## 
## Call:
## lm(formula = log(date_mi$Structure_Cost) ~ log(date_mi$Home_Value))
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.45296 -0.13590 -0.01300  0.08752  0.43916 
## 
## Coefficients:
##                         Estimate Std. Error t value Pr(>|t|)    
## (Intercept)             -0.25280    0.55484  -0.456    0.649    
## log(date_mi$Home_Value)  0.94477    0.04497  21.010   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.2147 on 124 degrees of freedom
## Multiple R-squared:  0.7807, Adjusted R-squared:  0.7789 
## F-statistic: 441.4 on 1 and 124 DF,  p-value: < 2.2e-16
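# The model is valid (F-test p-value < 2.2e-16); the intercept is not statistically significant (p = 0.649).
# R^2 is higher than in the linear model (0.7807 vs 0.5933), so the log-log form fits better
# (note: the two R^2 values refer to different dependent variables, cost vs log(cost)).
# 78.07% of the variation in log structure cost is explained by the variation in log home value.
# 0.94477 is the elasticity of structure cost with respect to home value.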
# Elasticity = slope (second coefficient) of the log-log model.
E_cost <- coef(model2)[2]
E_cost
## log(date_mi$Home_Value) 
##               0.9447732
# La cresterea valorilor locuintelor cu 1%, costul locuintelor pe termen lung creste cu 0.944%.
# Valoarea apartine (0,1)
# Costul marginal unui apartament < Costul total al locuintelor
# Zona de eficienta economica
# Deoarece elasticitatea < 1, activitatea de constructie a locuintelor in Miami prezinta economii
# la scala, deci activitatea se afla intr-o zona de eficienta economica
# Altfel => dizeconomii la scala

# Model for my city (Chicago)
# Scatter plot with a fitted regression line.
# A constant colour must be set outside aes(): inside aes() the string
# "aquamarine3" is treated as a data value, which draws a default palette
# colour and adds a meaningless legend.
ggplot(date_chi, aes(x = Home_Value, y = Structure_Cost)) +
  geom_point(col = "aquamarine3") +
  geom_smooth(method = "lm", col = "aquamarine3")
## `geom_smooth()` using formula = 'y ~ x'

# Linear model in levels: structure cost explained by home value.
m1 <- lm(date_chi$Structure_Cost ~ date_chi$Home_Value)
summary(m1)
## 
## Call:
## lm(formula = date_chi$Structure_Cost ~ date_chi$Home_Value)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -47502 -16334  -9395   1149  70201 
## 
## Coefficients:
##                      Estimate Std. Error t value Pr(>|t|)    
## (Intercept)         1.134e+04  9.667e+03   1.173    0.243    
## date_chi$Home_Value 5.970e-01  4.357e-02  13.702   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 33250 on 124 degrees of freedom
## Multiple R-squared:  0.6022, Adjusted R-squared:  0.599 
## F-statistic: 187.8 on 1 and 124 DF,  p-value: < 2.2e-16
# Scatter plot of the two variables on the log scale.
ggplot(
  data = date_chi,
  mapping = aes(x = log(Home_Value), y = log(Structure_Cost))
) +
  geom_point(colour = "lightblue")

# Log-log model: the slope is the elasticity of structure cost with respect
# to home value.
m2 <- lm(formula = log(date_chi$Structure_Cost) ~ log(date_chi$Home_Value))
summary(object = m2)
## 
## Call:
## lm(formula = log(date_chi$Structure_Cost) ~ log(date_chi$Home_Value))
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.24910 -0.12245 -0.05806  0.09441  0.39835 
## 
## Coefficients:
##                          Estimate Std. Error t value Pr(>|t|)    
## (Intercept)              -0.03949    0.60118  -0.066    0.948    
## log(date_chi$Home_Value)  0.96669    0.04924  19.633   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.1888 on 124 degrees of freedom
## Multiple R-squared:  0.7566, Adjusted R-squared:  0.7546 
## F-statistic: 385.5 on 1 and 124 DF,  p-value: < 2.2e-16
# PENTRU DATA VIITOARE (TEMA)
# SA FACEM MODELE DE REGRESIE LINIARE SI NELINIARE PENTRU 3 ORASE
# INTERPRETARI GRAFICE, BOXPLOT-URI, HISTOGRAME, SK, KURT, ANALIZA DE REZIDUURI (ANALIZA CA AZI)

# A SE VEDEA EXPLICATIILE DE LA CURS

# Descriptive statistics for Miami: quartiles, mean and range.
summary(date_mi[["Home_Value"]])
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##  126865  153776  199783  249010  314385  538482
summary(date_mi[["Structure_Cost"]])
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   42553   59543   82511   98690  143387  187085
# Standard deviations: computed once, stored, then printed.
sd_home_mi <- sd(date_mi[["Home_Value"]], na.rm = TRUE)
sd_home_mi
## [1] 113987
sd_cost_mi <- sd(date_mi[["Structure_Cost"]], na.rm = TRUE)
sd_cost_mi
## [1] 44648.79

# Coefficients of variation (standard deviation / mean).
sd_home_mi / mean(date_mi[["Home_Value"]], na.rm = TRUE)
## [1] 0.4577617
sd_cost_mi / mean(date_mi[["Structure_Cost"]], na.rm = TRUE)
## [1] 0.4524129
# Both coefficients of variation (about 45.8% and 45.2%) are ABOVE the 0.35
# (35%) threshold, so the series are heterogeneous and the means of home
# values and costs are not representative.

library(ggplot2)
library(moments)

# Skewness of the two Miami series.
skewness(date_mi[["Home_Value"]], na.rm = TRUE)
## [1] 0.9689774
skewness(date_mi[["Structure_Cost"]], na.rm = TRUE)
## [1] 0.5030657
# Skewness > 0: both distributions are right-skewed (positive asymmetry),
# more strongly for Home_Value (0.97) than for Structure_Cost (0.50).

# Kurtosis of the two Miami series (compared against 3).
kurtosis(date_mi[["Home_Value"]], na.rm = TRUE)
## [1] 3.008205
kurtosis(date_mi[["Structure_Cost"]], na.rm = TRUE)
## [1] 1.821421
# Home_Value is marginally above 3 (about mesokurtic); only Structure_Cost
# is below 3, i.e. platykurtic (flattened).

# Distribution plots for Miami on a 2 x 2 grid:
# histogram and boxplot for each of the two variables.
par(mfrow = c(2, 2))
hist(date_mi[["Home_Value"]], main = "Histograma valoare casa - Miami",
     xlab = "Valoare casa", col = "royalblue")
boxplot(date_mi[["Home_Value"]], main = "Boxplot valoare casa - Miami",
        horizontal = TRUE, col = "royalblue")
hist(date_mi[["Structure_Cost"]], main = "Histograma cost structura - Miami",
     xlab = "Cost structura", col = "salmon")
boxplot(date_mi[["Structure_Cost"]], main = "Boxplot cost structura - Miami",
        horizontal = TRUE, col = "salmon")

# According to the original notes, the boxplots show no visible outliers and
# the distributions are relatively stable, with a slight asymmetry.

# Linear and non-linear regression for Miami: linear model in levels first.
m1_mi <- lm(formula = date_mi$Structure_Cost ~ date_mi$Home_Value)
summary(object = m1_mi)
## 
## Call:
## lm(formula = date_mi$Structure_Cost ~ date_mi$Home_Value)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -57628 -16249  -7940   5022  56803 
## 
## Coefficients:
##                     Estimate Std. Error t value Pr(>|t|)    
## (Intercept)        2.356e+04  6.139e+03   3.838 0.000197 ***
## date_mi$Home_Value 3.017e-01  2.243e-02  13.450  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 28590 on 124 degrees of freedom
## Multiple R-squared:  0.5933, Adjusted R-squared:   0.59 
## F-statistic: 180.9 on 1 and 124 DF,  p-value: < 2.2e-16
# R-squared este 0.5933, deci 59.33% din variatia costului structural este explicata de valoarea casei
# p-value pentru coeficienti este foarte mic (<0.05), deci modelul si variabilele sunt semnificative statistic 
# La o crestere cu 1 a valorii locuintelor, costul structural creste cu 3.017e-01

# Log-log model for Miami: log(Structure_Cost) regressed on log(Home_Value);
# the slope is read as an elasticity.
m2_mi <- lm(log(date_mi$Structure_Cost) ~ log(date_mi$Home_Value))
# Print coefficients, significance tests and goodness of fit.
summary(m2_mi)
## 
## Call:
## lm(formula = log(date_mi$Structure_Cost) ~ log(date_mi$Home_Value))
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.45296 -0.13590 -0.01300  0.08752  0.43916 
## 
## Coefficients:
##                         Estimate Std. Error t value Pr(>|t|)    
## (Intercept)             -0.25280    0.55484  -0.456    0.649    
## log(date_mi$Home_Value)  0.94477    0.04497  21.010   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.2147 on 124 degrees of freedom
## Multiple R-squared:  0.7807, Adjusted R-squared:  0.7789 
## F-statistic: 441.4 on 1 and 124 DF,  p-value: < 2.2e-16
# R-squared rose to 0.7807 (78.07%), so the log-log model fits better than the linear one
# The elasticity of structure cost with respect to home value is 0.944
# A 1% increase in home value is associated with a 0.94477% increase in structure cost

# Scatter plot of the Miami data with the fitted linear regression line.
par(mfrow = c(1, 1))
plot(x = date_mi[["Home_Value"]], y = date_mi[["Structure_Cost"]],
     main = "Regresie liniara Miami",
     xlab = "Valoare casa", ylab = "Cost structura", col = "blue")
abline(reg = m1_mi, lwd = 2, col = "red")

# Residual analysis for the Miami log-log model.
erori_mi <- residuals(m2_mi)
mean(erori_mi)
## [1] -1.934376e-17
sd(erori_mi)
## [1] 0.2138231
skewness(erori_mi)
## [1] 0.1185884
kurtosis(erori_mi)
## [1] 2.567293
# The mean of the residuals is numerically zero (order e-17); for a least
# squares fit with an intercept this holds by construction.
# The skewness is close to 0, so the residuals are roughly symmetric.

# Histogram and boxplot of the residuals, side by side.
par(mfrow = c(1, 2))
hist(erori_mi, main = "Hist. reziduuri - Miami", xlab = "Reziduuri",
     col = "grey")
boxplot(erori_mi, main = "Boxplot reziduuri - Miami", horizontal = TRUE,
        col = "grey")

# Descriptive statistics for Chicago: quartiles, mean and range.
summary(date_chi[["Home_Value"]])
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   95518  153445  215035  211197  260425  346159
summary(date_chi[["Structure_Cost"]])
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   69522   93099  120706  137419  187915  239964
# Standard deviations, stored for the coefficients of variation below.
sd_home_chi <- sd(date_chi[["Home_Value"]], na.rm = TRUE)
sd_cost_chi <- sd(date_chi[["Structure_Cost"]], na.rm = TRUE)

# Coefficients of variation (standard deviation / mean).
sd_home_chi / mean(date_chi[["Home_Value"]], na.rm = TRUE)
## [1] 0.3232225
sd_cost_chi / mean(date_chi[["Structure_Cost"]], na.rm = TRUE)
## [1] 0.3821368
# Home_Value (32.3%) is below the 0.35 (35%) threshold, so that series is
# fairly homogeneous and its mean is representative; Structure_Cost (38.2%)
# is above the threshold, so its mean is less representative for Chicago.

# Skewness of the two Chicago series (both positive: right-skewed).
skewness(date_chi[["Home_Value"]], na.rm = TRUE)
## [1] 0.1746737
skewness(date_chi[["Structure_Cost"]], na.rm = TRUE)
## [1] 0.4906416
# Kurtosis of the two Chicago series (compared against 3).
kurtosis(date_chi[["Home_Value"]], na.rm = TRUE)
## [1] 2.032091
kurtosis(date_chi[["Structure_Cost"]], na.rm = TRUE)
## [1] 1.872484
# Both values are below 3, so both distributions are platykurtic.

# Distribution plots for Chicago on a 2 x 2 grid:
# histogram and boxplot for each of the two variables.
par(mfrow = c(2, 2))
hist(date_chi[["Home_Value"]], main = "Histograma valoare casa - Chicago",
     xlab = "Valoare casa", col = "seagreen")
boxplot(date_chi[["Home_Value"]], main = "Boxplot valoare casa - Chicago",
        horizontal = TRUE, col = "seagreen")
hist(date_chi[["Structure_Cost"]], main = "Histograma cost structura - Chicago",
     xlab = "Cost structura", col = "orange")
boxplot(date_chi[["Structure_Cost"]], main = "Boxplot cost structura - Chicago",
        horizontal = TRUE, col = "orange")

# According to the original notes, the boxplots show no important outliers
# for either cost or value.

# Linear and non-linear regression for Chicago: linear model in levels first.
m1_chi <- lm(formula = date_chi$Structure_Cost ~ date_chi$Home_Value)
summary(object = m1_chi)
## 
## Call:
## lm(formula = date_chi$Structure_Cost ~ date_chi$Home_Value)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -47502 -16334  -9395   1149  70201 
## 
## Coefficients:
##                      Estimate Std. Error t value Pr(>|t|)    
## (Intercept)         1.134e+04  9.667e+03   1.173    0.243    
## date_chi$Home_Value 5.970e-01  4.357e-02  13.702   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 33250 on 124 degrees of freedom
## Multiple R-squared:  0.6022, Adjusted R-squared:  0.599 
## F-statistic: 187.8 on 1 and 124 DF,  p-value: < 2.2e-16
# R-squared este de 0.6022 (60.22%), deci 60.22% din variatia costului structural este explicata de valoarea casei
# La o crestere cu 1 a valorii locuintelor, costul structural creste cu 5.970e-01

# Log-log model for Chicago: log(Structure_Cost) regressed on log(Home_Value);
# the slope is read as an elasticity.
m2_chi <- lm(log(date_chi$Structure_Cost) ~ log(date_chi$Home_Value))
# Print coefficients, significance tests and goodness of fit.
summary(m2_chi)
## 
## Call:
## lm(formula = log(date_chi$Structure_Cost) ~ log(date_chi$Home_Value))
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.24910 -0.12245 -0.05806  0.09441  0.39835 
## 
## Coefficients:
##                          Estimate Std. Error t value Pr(>|t|)    
## (Intercept)              -0.03949    0.60118  -0.066    0.948    
## log(date_chi$Home_Value)  0.96669    0.04924  19.633   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.1888 on 124 degrees of freedom
## Multiple R-squared:  0.7566, Adjusted R-squared:  0.7546 
## F-statistic: 385.5 on 1 and 124 DF,  p-value: < 2.2e-16
# R-squared is 0.7566 (75.66%), so 75.66% of the variation in log structure cost is explained by log home value
# A 1% increase in home value is associated with a 0.96669% increase in structure cost (elasticity)

# Scatter plot of the Chicago data with the fitted linear regression line.
par(mfrow = c(1, 1))
plot(x = date_chi[["Home_Value"]], y = date_chi[["Structure_Cost"]],
     main = "Regresie liniara Chicago",
     xlab = "Valoare casa", ylab = "Cost structura", col = "darkgreen")
abline(reg = m1_chi, lwd = 2, col = "red")

# Residual analysis for the Chicago log-log model.
erori_chi <- residuals(m2_chi)
mean(erori_chi)
## [1] -2.89482e-17
sd(erori_chi)
## [1] 0.1880798
skewness(erori_chi)
## [1] 0.802925
kurtosis(erori_chi)
## [1] 2.486434
# The mean of the residuals is numerically zero.
# The skewness (0.80) indicates a noticeable right skew, so the residual
# distribution is not symmetric.

# Histogram and boxplot of the residuals, side by side.
par(mfrow = c(1, 2))
hist(erori_chi, main = "Hist. reziduuri - Chicago", xlab = "Reziduuri",
     col = "grey")
boxplot(erori_chi, main = "Boxplot reziduuri - Chicago", horizontal = TRUE,
        col = "grey")

# Descriptive statistics for San Francisco: quartiles, mean and range.
summary(date_sf[["Home_Value"]])
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##  235636  406116  694572  692200  909208 1346489
summary(date_sf[["Structure_Cost"]])
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   41935   73330  104522  122310  177796  257243
# Standard deviations, stored for the coefficients of variation below.
sd_home_sf <- sd(date_sf[["Home_Value"]], na.rm = TRUE)
sd_cost_sf <- sd(date_sf[["Structure_Cost"]], na.rm = TRUE)

# Coefficients of variation (standard deviation / mean).
sd_home_sf / mean(date_sf[["Home_Value"]], na.rm = TRUE)
## [1] 0.4802773
sd_cost_sf / mean(date_sf[["Structure_Cost"]], na.rm = TRUE)
## [1] 0.5025894
# Both coefficients (about 48.0% and 50.3%) are ABOVE the 35% threshold, so
# the series are heterogeneous and the means are not representative.

# Skewness of the two San Francisco series.
skewness(date_sf[["Home_Value"]], na.rm = TRUE)
## [1] 0.4319539
skewness(date_sf[["Structure_Cost"]], na.rm = TRUE)
## [1] 0.6024737
# Both values are positive: the distributions are right-skewed.

# Kurtosis of the two San Francisco series (compared against 3).
kurtosis(date_sf[["Home_Value"]], na.rm = TRUE)
## [1] 1.892514
kurtosis(date_sf[["Structure_Cost"]], na.rm = TRUE)
## [1] 2.135844
# Values below 3 indicate flattened (platykurtic) distributions.

# Distribution plots for San Francisco on a 2 x 2 grid:
# histogram and boxplot for each of the two variables.
par(mfrow = c(2, 2))
hist(date_sf[["Home_Value"]], main = "Histograma valoare casa - San Francisco",
     xlab = "Valoare casa", col = "purple")
boxplot(date_sf[["Home_Value"]], main = "Boxplot valoare casa - San Francisco",
        horizontal = TRUE, col = "purple")
hist(date_sf[["Structure_Cost"]],
     main = "Histograma cost structura - San Francisco",
     xlab = "Cost structura", col = "pink")
boxplot(date_sf[["Structure_Cost"]],
        main = "Boxplot cost structura - San Francisco",
        horizontal = TRUE, col = "pink")

# According to the original notes, the boxplots show no outliers.

# Linear and non-linear regression for San Francisco: linear model first.
m1_sf <- lm(formula = date_sf$Structure_Cost ~ date_sf$Home_Value)
summary(object = m1_sf)
## 
## Call:
## lm(formula = date_sf$Structure_Cost ~ date_sf$Home_Value)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -71698 -17403  -6231  11949  73480 
## 
## Coefficients:
##                     Estimate Std. Error t value Pr(>|t|)    
## (Intercept)        1.632e+04  7.142e+03   2.285    0.024 *  
## date_sf$Home_Value 1.531e-01  9.308e-03  16.451   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 34600 on 124 degrees of freedom
## Multiple R-squared:  0.6858, Adjusted R-squared:  0.6832 
## F-statistic: 270.6 on 1 and 124 DF,  p-value: < 2.2e-16
# R-squared este de 0.6858 (68.58%), deci 68.58% din variatia costului structural este explicata de valoarea casei
# p-value este foarte mic (<0.05), deci relatia liniara este semnificativa statistic
# La o crestere cu 1 a valorii locuintelor, costul structural creste cu 1.531e-01

# Log-log model for San Francisco: log(Structure_Cost) regressed on
# log(Home_Value); the slope is read as an elasticity.
m2_sf <- lm(log(date_sf$Structure_Cost) ~ log(date_sf$Home_Value))
# Print coefficients, significance tests and goodness of fit.
summary(m2_sf)
## 
## Call:
## lm(formula = log(date_sf$Structure_Cost) ~ log(date_sf$Home_Value))
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.40725 -0.15074 -0.02867  0.12825  0.44538 
## 
## Coefficients:
##                         Estimate Std. Error t value Pr(>|t|)    
## (Intercept)             -0.77721    0.50104  -1.551    0.123    
## log(date_sf$Home_Value)  0.92777    0.03757  24.693   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.2123 on 124 degrees of freedom
## Multiple R-squared:  0.831,  Adjusted R-squared:  0.8296 
## F-statistic: 609.8 on 1 and 124 DF,  p-value: < 2.2e-16
# R-squared is 0.831 (83.1%), so 83.1% of the variation in log structure cost is explained by log home value
# The p-value is very small (<0.05), so the log-log relationship is statistically significant
# A 1% increase in home value is associated with a 0.92777% increase in structure cost (elasticity)

# Scatter plot of the San Francisco data with the fitted linear regression line.
par(mfrow = c(1, 1))
plot(x = date_sf[["Home_Value"]], y = date_sf[["Structure_Cost"]],
     main = "Regresie liniara San Francisco",
     xlab = "Valoare casa", ylab = "Cost structura", col = "darkblue")
abline(reg = m1_sf, lwd = 2, col = "red")

# Residual analysis for the San Francisco log-log model.
erori_sf <- residuals(m2_sf)
mean(erori_sf)
## [1] -1.29631e-17
sd(erori_sf)
## [1] 0.2114198
skewness(erori_sf)
## [1] 0.1833915
kurtosis(erori_sf)
## [1] 2.382807
# The mean of the residuals is numerically zero; for a least squares fit with
# an intercept this holds by construction.
# Skewness is close to 0 and kurtosis is below 3: the residuals are roughly
# symmetric, without heavy tails.

# Histogram and boxplot of the residuals, side by side.
par(mfrow = c(1, 2))
hist(erori_sf, main = "Hist. reziduuri - San Francisco", xlab = "Reziduuri",
     col = "grey")
boxplot(erori_sf, main = "Boxplot reziduuri - San Francisco",
        horizontal = TRUE, col = "grey")