#rm(list=ls())
options(repos = c(CRAN = "https://cran.rstudio.com/"))
# First, make sure the required packages are available.
# Only packages that are not installed yet are downloaded. Reinstalling on
# every run is slow, and the original run log showed "Permission denied"
# warnings when dplyr, randomForest and ggdist were reinstalled over their
# existing installations.
required_packages <- c(
  "ggplot2", "dplyr", "randomForest", "performance", "skimr",
  "corrplot", "GGally", "patchwork", "scales", "ggdist"
)
missing_packages <- setdiff(required_packages, rownames(installed.packages()))
if (length(missing_packages) > 0) {
  install.packages(missing_packages)
}
#Gleiches für die Bibliotheken
library(ggplot2)
library(dplyr)
## 
## Attache Paket: 'dplyr'
## Die folgenden Objekte sind maskiert von 'package:stats':
## 
##     filter, lag
## Die folgenden Objekte sind maskiert von 'package:base':
## 
##     intersect, setdiff, setequal, union
library(randomForest)
## randomForest 4.7-1.2
## Type rfNews() to see new features/changes/bug fixes.
## 
## Attache Paket: 'randomForest'
## Das folgende Objekt ist maskiert 'package:dplyr':
## 
##     combine
## Das folgende Objekt ist maskiert 'package:ggplot2':
## 
##     margin
library(performance)
library(skimr)
library(corrplot)
## corrplot 0.95 loaded
library(GGally)
## Registered S3 method overwritten by 'GGally':
##   method from   
##   +.gg   ggplot2
library(patchwork)
library(scales)
library(ggdist)
# Show the current working directory; the relative path "Housing.csv" below
# is resolved against it.
getwd()
## [1] "C:/Users/DELL/OneDrive/Dokumente/01_03_RStudio/HousePricePrediction/Projektarbeit"
# The data set comprises 545 observations of 13 variables.
housing <- read.csv("Housing.csv")
# Structure of the data:
# a first look at the numeric and categorical variables.
str(housing)
## 'data.frame':    545 obs. of  13 variables:
##  $ price           : int  13300000 12250000 12250000 12215000 11410000 10850000 10150000 10150000 9870000 9800000 ...
##  $ area            : int  7420 8960 9960 7500 7420 7500 8580 16200 8100 5750 ...
##  $ bedrooms        : int  4 4 3 4 4 3 4 5 4 3 ...
##  $ bathrooms       : int  2 4 2 2 1 3 3 3 1 2 ...
##  $ stories         : int  3 4 2 2 2 1 4 2 2 4 ...
##  $ mainroad        : chr  "yes" "yes" "yes" "yes" ...
##  $ guestroom       : chr  "no" "no" "no" "no" ...
##  $ basement        : chr  "no" "no" "yes" "yes" ...
##  $ hotwaterheating : chr  "no" "no" "no" "no" ...
##  $ airconditioning : chr  "yes" "yes" "no" "yes" ...
##  $ parking         : int  2 3 2 3 2 2 2 0 2 1 ...
##  $ prefarea        : chr  "yes" "no" "yes" "yes" ...
##  $ furnishingstatus: chr  "furnished" "furnished" "semi-furnished" "furnished" ...
# Summary of the data (per-column descriptive statistics)
summary(housing)
##      price               area          bedrooms       bathrooms    
##  Min.   : 1750000   Min.   : 1650   Min.   :1.000   Min.   :1.000  
##  1st Qu.: 3430000   1st Qu.: 3600   1st Qu.:2.000   1st Qu.:1.000  
##  Median : 4340000   Median : 4600   Median :3.000   Median :1.000  
##  Mean   : 4766729   Mean   : 5151   Mean   :2.965   Mean   :1.286  
##  3rd Qu.: 5740000   3rd Qu.: 6360   3rd Qu.:3.000   3rd Qu.:2.000  
##  Max.   :13300000   Max.   :16200   Max.   :6.000   Max.   :4.000  
##     stories        mainroad          guestroom           basement        
##  Min.   :1.000   Length:545         Length:545         Length:545        
##  1st Qu.:1.000   Class :character   Class :character   Class :character  
##  Median :2.000   Mode  :character   Mode  :character   Mode  :character  
##  Mean   :1.806                                                           
##  3rd Qu.:2.000                                                           
##  Max.   :4.000                                                           
##  hotwaterheating    airconditioning       parking         prefarea        
##  Length:545         Length:545         Min.   :0.0000   Length:545        
##  Class :character   Class :character   1st Qu.:0.0000   Class :character  
##  Mode  :character   Mode  :character   Median :0.0000   Mode  :character  
##                                        Mean   :0.6936                     
##                                        3rd Qu.:1.0000                     
##                                        Max.   :3.0000                     
##  furnishingstatus  
##  Length:545        
##  Class :character  
##  Mode  :character  
##                    
##                    
## 
# All further work is done on the copy "h1"; "housing" stays untouched.
h1 <- housing
# Check for missing data: number of NA values per column.
colSums(is.na(h1))
##            price             area         bedrooms        bathrooms 
##                0                0                0                0 
##          stories         mainroad        guestroom         basement 
##                0                0                0                0 
##  hotwaterheating  airconditioning          parking         prefarea 
##                0                0                0                0 
## furnishingstatus 
##                0
# Summary and data-quality check: one row of skim statistics per variable,
# converted to a plain tibble for printing.
library(skimr)
h1_skim_tibble <- tibble::as_tibble(skim(h1))
print(h1_skim_tibble)
## # A tibble: 13 × 17
##    skim_type skim_variable   n_missing complete_rate character.min character.max
##    <chr>     <chr>               <int>         <dbl>         <int>         <int>
##  1 character mainroad                0             1             2             3
##  2 character guestroom               0             1             2             3
##  3 character basement                0             1             2             3
##  4 character hotwaterheating         0             1             2             3
##  5 character airconditioning         0             1             2             3
##  6 character prefarea                0             1             2             3
##  7 character furnishingstat…         0             1             9            14
##  8 numeric   price                   0             1            NA            NA
##  9 numeric   area                    0             1            NA            NA
## 10 numeric   bedrooms                0             1            NA            NA
## 11 numeric   bathrooms               0             1            NA            NA
## 12 numeric   stories                 0             1            NA            NA
## 13 numeric   parking                 0             1            NA            NA
## # ℹ 11 more variables: character.empty <int>, character.n_unique <int>,
## #   character.whitespace <int>, numeric.mean <dbl>, numeric.sd <dbl>,
## #   numeric.p0 <dbl>, numeric.p25 <dbl>, numeric.p50 <dbl>, numeric.p75 <dbl>,
## #   numeric.p100 <dbl>, numeric.hist <chr>
# Adjust the variable types: the yes/no columns and the furnishing status
# are categorical, so they are converted from character to factor.
h1 <- h1 %>%
  mutate(
    across(
      c(
        mainroad, guestroom, basement, hotwaterheating,
        airconditioning, prefarea, furnishingstatus
      ),
      as.factor
    )
  )
# Structure of the data:
# a second look at the numeric and categorical variables.
str(h1)
## 'data.frame':    545 obs. of  13 variables:
##  $ price           : int  13300000 12250000 12250000 12215000 11410000 10850000 10150000 10150000 9870000 9800000 ...
##  $ area            : int  7420 8960 9960 7500 7420 7500 8580 16200 8100 5750 ...
##  $ bedrooms        : int  4 4 3 4 4 3 4 5 4 3 ...
##  $ bathrooms       : int  2 4 2 2 1 3 3 3 1 2 ...
##  $ stories         : int  3 4 2 2 2 1 4 2 2 4 ...
##  $ mainroad        : Factor w/ 2 levels "no","yes": 2 2 2 2 2 2 2 2 2 2 ...
##  $ guestroom       : Factor w/ 2 levels "no","yes": 1 1 1 1 2 1 1 1 2 2 ...
##  $ basement        : Factor w/ 2 levels "no","yes": 1 1 2 2 2 2 1 1 2 1 ...
##  $ hotwaterheating : Factor w/ 2 levels "no","yes": 1 1 1 1 1 1 1 1 1 1 ...
##  $ airconditioning : Factor w/ 2 levels "no","yes": 2 2 1 2 2 2 2 1 2 2 ...
##  $ parking         : int  2 3 2 3 2 2 2 0 2 1 ...
##  $ prefarea        : Factor w/ 2 levels "no","yes": 2 1 2 2 1 2 2 1 2 2 ...
##  $ furnishingstatus: Factor w/ 3 levels "furnished","semi-furnished",..: 1 1 2 1 1 2 2 3 1 3 ...
# Summary of the "new" data (h1 after the type conversion)
summary(h1)
##      price               area          bedrooms       bathrooms    
##  Min.   : 1750000   Min.   : 1650   Min.   :1.000   Min.   :1.000  
##  1st Qu.: 3430000   1st Qu.: 3600   1st Qu.:2.000   1st Qu.:1.000  
##  Median : 4340000   Median : 4600   Median :3.000   Median :1.000  
##  Mean   : 4766729   Mean   : 5151   Mean   :2.965   Mean   :1.286  
##  3rd Qu.: 5740000   3rd Qu.: 6360   3rd Qu.:3.000   3rd Qu.:2.000  
##  Max.   :13300000   Max.   :16200   Max.   :6.000   Max.   :4.000  
##     stories      mainroad  guestroom basement  hotwaterheating airconditioning
##  Min.   :1.000   no : 77   no :448   no :354   no :520         no :373        
##  1st Qu.:1.000   yes:468   yes: 97   yes:191   yes: 25         yes:172        
##  Median :2.000                                                                
##  Mean   :1.806                                                                
##  3rd Qu.:2.000                                                                
##  Max.   :4.000                                                                
##     parking       prefarea        furnishingstatus
##  Min.   :0.0000   no :417   furnished     :140    
##  1st Qu.:0.0000   yes:128   semi-furnished:227    
##  Median :0.0000             unfurnished   :178    
##  Mean   :0.6936                                   
##  3rd Qu.:1.0000                                   
##  Max.   :3.0000
# Summary of the numeric variables
summary(h1[c("price", "area", "bedrooms", "bathrooms", "stories", "parking")])
##      price               area          bedrooms       bathrooms    
##  Min.   : 1750000   Min.   : 1650   Min.   :1.000   Min.   :1.000  
##  1st Qu.: 3430000   1st Qu.: 3600   1st Qu.:2.000   1st Qu.:1.000  
##  Median : 4340000   Median : 4600   Median :3.000   Median :1.000  
##  Mean   : 4766729   Mean   : 5151   Mean   :2.965   Mean   :1.286  
##  3rd Qu.: 5740000   3rd Qu.: 6360   3rd Qu.:3.000   3rd Qu.:2.000  
##  Max.   :13300000   Max.   :16200   Max.   :6.000   Max.   :4.000  
##     stories         parking      
##  Min.   :1.000   Min.   :0.0000  
##  1st Qu.:1.000   1st Qu.:0.0000  
##  Median :2.000   Median :0.0000  
##  Mean   :1.806   Mean   :0.6936  
##  3rd Qu.:2.000   3rd Qu.:1.0000  
##  Max.   :4.000   Max.   :3.0000
# Data set containing only the numeric variables.
# select(where(...)) replaces the superseded select_if().
h1_num <- dplyr::select(h1, where(is.numeric))
# Correlation matrix of the numeric variables I
cor(h1_num)
##               price       area  bedrooms bathrooms    stories    parking
## price     1.0000000 0.53599735 0.3664940 0.5175453 0.42071237 0.38439365
## area      0.5359973 1.00000000 0.1518585 0.1938195 0.08399605 0.35298048
## bedrooms  0.3664940 0.15185849 1.0000000 0.3739302 0.40856424 0.13926990
## bathrooms 0.5175453 0.19381953 0.3739302 1.0000000 0.32616471 0.17749582
## stories   0.4207124 0.08399605 0.4085642 0.3261647 1.00000000 0.04554709
## parking   0.3843936 0.35298048 0.1392699 0.1774958 0.04554709 1.00000000
h1_corr_mat <- round(cor(h1_num), 2)
# Correlation matrix of the numeric variables II
# (cl.pos = "n" removes the colour legend)
corrplot(
  h1_corr_mat,
  method = "color",
  title = "Korrelationsmatrix",
  addCoef.col = "black",
  number.cex = 0.8,
  tl.col = "black",
  tl.srt = 45,
  tl.cex = 1,
  mar = c(1, 1, 2, 1),
  cl.pos = "n"
)

# library(GGally)
GGally::ggpairs(h1_num)

# When a plot is entered in the console it appears in the "Plots" pane
# (bottom right) and can be saved from there.
# Scatter plot: area vs. price
# element_line() uses `linewidth`; its `size` argument is deprecated since
# ggplot2 3.4.0.
ggplot(h1, aes(x = area, y = price)) +
  geom_point(color = "skyblue") +
  theme_minimal() +
  scale_y_continuous(labels = dollar_format()) +
  theme(
    plot.title = element_text(hjust = 0, size = 10, face = "bold"),
    axis.line.x = element_line(color = "black", linewidth = 0.2),
    axis.line.y = element_line(color = "black", linewidth = 0.2),
    axis.text.x = element_text(size = 10, color = "black"),
    axis.text.y = element_text(size = 10, color = "black"),
    legend.text = element_text(size = 10),
    legend.title = element_text(size = 10)
  ) +
  labs(title = "Scatter Plot of Area vs. Price", x = "", y = "")
## Warning: The `size` argument of `element_line()` is deprecated as of ggplot2 3.4.0.
## ℹ Please use the `linewidth` argument instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

# Further analysis of "furnishingstatus"
# Bar plot: number of houses per furnishing status
# element_line() uses `linewidth`; its `size` argument is deprecated since
# ggplot2 3.4.0.
library(ggplot2)
ggplot(h1, aes(x = furnishingstatus, fill = furnishingstatus)) +
  geom_bar() +
  theme_minimal() +
  theme(
    plot.title = element_text(hjust = 0, size = 10, face = "bold"),
    axis.line.x = element_line(color = "black", linewidth = 0.2),
    axis.line.y = element_line(color = "black", linewidth = 0.2),
    axis.text.x = element_text(size = 10, color = "black"),
    axis.text.y = element_text(size = 10, color = "black"),
    legend.position = "none"
  ) +
  labs(title = "Distribution of Furnishing Status", x = "", y = "")

# Summary per furnishing status: group size plus mean, median and standard
# deviation of price and area. The .names pattern yields the columns
# mean_price, median_price, sd_price, mean_area, median_area, sd_area.
h1 %>%
  group_by(furnishingstatus) %>%
  summarise(
    count = n(),
    across(
      c(price, area),
      list(mean = mean, median = median, sd = sd),
      .names = "{.fn}_{.col}"
    )
  )
## # A tibble: 3 × 8
##   furnishingstatus count mean_price median_price sd_price mean_area median_area
##   <fct>            <int>      <dbl>        <dbl>    <dbl>     <dbl>       <dbl>
## 1 furnished          140   5495696       5075000 2117857.     5688.        5800
## 2 semi-furnished     227   4907524.      4585000 1596688.     5166.        4600
## 3 unfurnished        178   4013831.      3430000 1720247.     4708.        4075
## # ℹ 1 more variable: sd_area <dbl>
# Box plot: prices by furnishing status, with the overall mean price drawn
# as a dashed red reference line.
ggplot(h1, aes(x = furnishingstatus, y = price, fill = furnishingstatus)) +
  geom_boxplot() +
  scale_y_continuous(labels = dollar_format()) +
  # A constant yintercept draws a single line; mapping it inside aes() would
  # draw one identical, overlapping line for every row of h1.
  geom_hline(
    yintercept = mean(h1$price),
    colour = "red", linetype = "dashed", linewidth = 0.5
  ) +
  theme_minimal() +
  # element_line() uses `linewidth`; `size` is deprecated since ggplot2 3.4.0.
  theme(
    plot.title = element_text(hjust = 0, size = 10, face = "bold"),
    axis.line.x = element_line(color = "black", linewidth = 0.2),
    axis.line.y = element_line(color = "black", linewidth = 0.2),
    axis.text.x = element_text(size = 10, color = "black"),
    axis.text.y = element_text(size = 10, color = "black"),
    legend.position = "none"
  ) +
  labs(title = "Housing Prices by Furnishing Status", x = "", y = "")

# Area by furnishing status: half-eye density slab, box plot with whisker
# caps (error bars) and the jittered individual observations.
ggplot(data = h1, mapping = aes(x = furnishingstatus, y = area)) +
  ggdist::stat_halfeye(
    aes(slab_colour = furnishingstatus),
    adjust = 0.6, justification = -0.25, .width = 0, point_colour = NA
  ) +
  stat_boxplot(
    geom = "errorbar", width = 0.4,
    position = position_dodge(width = 0.1), alpha = 0.5
  ) +
  # x and y are inherited from the plot-level mapping.
  geom_boxplot(
    aes(fill = furnishingstatus),
    width = 0.4, position = position_dodge(width = 0.1), show.legend = FALSE
  ) +
  # Argument names spelled out instead of relying on partial matching (w, h).
  geom_jitter(
    color = "black", size = 1,
    position = position_jitter(width = 0.1, height = 0.1)
  ) +
  labs(title = "Housing Area by Furnishing Status", x = "", y = "") +
  theme_minimal() +
  # element_line() uses `linewidth`; `size` is deprecated since ggplot2 3.4.0.
  theme(
    plot.title = element_text(hjust = 0, size = 10, face = "bold"),
    axis.line.x = element_line(color = "black", linewidth = 0.2),
    axis.line.y = element_line(color = "black", linewidth = 0.2),
    axis.text.x = element_text(size = 10, color = "black"),
    axis.text.y = element_text(size = 10, color = "black"),
    legend.position = "none"
  )

# Price by furnishing status: half-eye density slab, box plot with whisker
# caps (error bars) and the jittered individual observations.
ggplot(data = h1, mapping = aes(x = furnishingstatus, y = price)) +
  ggdist::stat_halfeye(
    aes(slab_colour = furnishingstatus),
    adjust = 0.6, justification = -0.25, .width = 0, point_colour = NA
  ) +
  stat_boxplot(
    geom = "errorbar", width = 0.4,
    position = position_dodge(width = 0.1), alpha = 0.5
  ) +
  scale_y_continuous(labels = dollar_format()) +
  # x and y are inherited from the plot-level mapping.
  geom_boxplot(
    aes(fill = furnishingstatus),
    width = 0.4, position = position_dodge(width = 0.1), show.legend = FALSE
  ) +
  # Argument names spelled out instead of relying on partial matching (w, h).
  geom_jitter(
    color = "black", size = 1,
    position = position_jitter(width = 0.1, height = 0.1)
  ) +
  labs(title = "Housing Prices by Furnishing Status", x = "", y = "") +
  theme_minimal() +
  # element_line() uses `linewidth`; `size` is deprecated since ggplot2 3.4.0.
  theme(
    plot.title = element_text(hjust = 0, size = 10, face = "bold"),
    axis.line.x = element_line(color = "black", linewidth = 0.2),
    axis.line.y = element_line(color = "black", linewidth = 0.2),
    axis.text.x = element_text(size = 10, color = "black"),
    axis.text.y = element_text(size = 10, color = "black"),
    legend.position = "none"
  )

# Scatter plot: price vs. area, coloured by furnishing status
# element_line() uses `linewidth`; its `size` argument is deprecated since
# ggplot2 3.4.0.
ggplot(h1, aes(x = area, y = price, color = furnishingstatus)) +
  geom_point() +
  scale_y_continuous(labels = dollar_format()) +
  theme_minimal() +
  theme(
    plot.title = element_text(hjust = 0, size = 10, face = "bold"),
    axis.line.x = element_line(color = "black", linewidth = 0.2),
    axis.line.y = element_line(color = "black", linewidth = 0.2),
    axis.text.x = element_text(size = 10, color = "black"),
    axis.text.y = element_text(size = 10, color = "black"),
    legend.text = element_text(size = 10),
    legend.title = element_text(size = 10)
  ) +
  labs(
    title = "Scatter Plot of Price vs. Area by Furnishing Status",
    x = "", y = ""
  )

# Continuing with "price"
# Histogram of the house prices (bin width 500,000)
# element_line() uses `linewidth`; its `size` argument is deprecated since
# ggplot2 3.4.0.
ggplot(h1, aes(x = price)) +
  geom_histogram(binwidth = 500000, fill = "skyblue", color = "black") +
  scale_x_continuous(labels = dollar_format()) +
  theme_minimal() +
  theme(
    plot.title = element_text(hjust = 0, size = 10, face = "bold"),
    axis.line.x = element_line(color = "black", linewidth = 0.2),
    axis.line.y = element_line(color = "black", linewidth = 0.2),
    axis.text.x = element_text(size = 10, color = "black"),
    axis.text.y = element_text(size = 10, color = "black")
  ) +
  labs(title = "Histogram of Housing Prices", x = "", y = "")

# Further EDA on "mainroad"
# Summary per main-road access: group size plus mean, median and standard
# deviation of price and area. The .names pattern yields the columns
# mean_price, median_price, sd_price, mean_area, median_area, sd_area.
h1 %>%
  group_by(mainroad) %>%
  summarise(
    count = n(),
    across(
      c(price, area),
      list(mean = mean, median = median, sd = sd),
      .names = "{.fn}_{.col}"
    )
  )
## # A tibble: 2 × 8
##   mainroad count mean_price median_price sd_price mean_area median_area sd_area
##   <fct>    <int>      <dbl>        <dbl>    <dbl>     <dbl>       <dbl>   <dbl>
## 1 no          77   3398905.      3290000  894735.     3606.        3410   1185.
## 2 yes        468   4991777.      4550000 1893639.     5405.        4975   2191.
# Price statistics by furnishing status and main-road access.
# `.groups = "drop"` returns an ungrouped tibble and avoids the
# "summarise() has grouped output" message that the implicit default emits
# when grouping by more than one variable.
h1 %>%
  group_by(furnishingstatus, mainroad) %>%
  summarise(
    count = n(),
    mean_price = mean(price),
    median_price = median(price),
    sd_price = sd(price),
    .groups = "drop"
  )
## # A tibble: 6 × 6
##   furnishingstatus mainroad count mean_price median_price sd_price
##   <fct>            <fct>    <int>      <dbl>        <dbl>    <dbl>
## 1 furnished        no           9   3412111.      3353000  998924.
## 2 furnished        yes        131   5638843.      5250000 2100769.
## 3 semi-furnished   no          31   3796247.      3815000  891339.
## 4 semi-furnished   yes        196   5083288.      4690000 1614032.
## 5 unfurnished      no          37   3062784.      2975000  744397.
## 6 unfurnished      yes        141   4263397.      3500000 1815785.
# Scatter plot: price vs. area, coloured by main-road access
# element_line() uses `linewidth`; its `size` argument is deprecated since
# ggplot2 3.4.0.
ggplot(h1, aes(x = area, y = price, color = mainroad)) +
  geom_point() +
  scale_y_continuous(labels = dollar_format()) +
  theme_minimal() +
  theme(
    plot.title = element_text(hjust = 0, size = 10, face = "bold"),
    axis.line.x = element_line(color = "black", linewidth = 0.2),
    axis.line.y = element_line(color = "black", linewidth = 0.2),
    axis.text.x = element_text(size = 10, color = "black"),
    axis.text.y = element_text(size = 10, color = "black"),
    legend.text = element_text(size = 10),
    legend.title = element_text(size = 10)
  ) +
  labs(
    title = "Scatter Plot of Price vs. Area by Main Road Access",
    x = "", y = ""
  )

# Remaining attributes ----

# Build a box plot of price for each level of one column.
#
# data:      data frame with a `price` column and the column named by
#            `group_col`.
# group_col: name (string) of the column to group by; it is wrapped in
#            factor() so that count columns such as bathrooms are treated
#            as discrete categories.
# title:     plot title.
# Returns the ggplot object.
price_boxplot <- function(data, group_col, title) {
  ggplot(
    data,
    aes(
      x = factor(.data[[group_col]]),
      y = price,
      fill = factor(.data[[group_col]])
    )
  ) +
    geom_boxplot() +
    # Label the price axis in millions, e.g. "$ 5 Mio".
    scale_y_continuous(labels = function(x) paste0("$ ", x / 1e6, " Mio")) +
    theme_minimal() +
    # element_line() uses `linewidth`; `size` is deprecated since
    # ggplot2 3.4.0.
    theme(
      plot.title = element_text(hjust = 0, size = 10, face = "bold"),
      axis.line.x = element_line(color = "black", linewidth = 0.2),
      axis.line.y = element_line(color = "black", linewidth = 0.2),
      axis.text.x = element_text(size = 10, color = "black"),
      axis.text.y = element_text(size = 10, color = "black"),
      legend.position = "none"
    ) +
    labs(title = title, x = "", y = "")
}

# NOTE(review): the single-letter object names are kept because other code
# may reference them. Be aware that `F` shadows the shorthand for FALSE for
# the rest of the session; consider descriptive names in a follow-up.
# Box plot bathrooms
A <- price_boxplot(h1, "bathrooms", "Housing Prices by Number of Bathrooms")
# Box plot stories
B <- price_boxplot(h1, "stories", "Housing Prices by Number of Stories")
# Box plot mainroad
C <- price_boxplot(h1, "mainroad", "Housing Prices by Main Road Access")
# Box plot guestroom
D <- price_boxplot(h1, "guestroom", "Housing Prices by Guest Room Availability")
# Box plot basement
E <- price_boxplot(h1, "basement", "Housing Prices by Basement Availability")
# Box plot hotwaterheating
F <- price_boxplot(h1, "hotwaterheating", "Housing Prices by Hot Water Heating")
# Box plot airconditioning
G <- price_boxplot(
  h1, "airconditioning", "Housing Prices by Air Conditioning Availability"
)
# Box plot parking
H <- price_boxplot(h1, "parking", "Housing Prices by Number of Parking Spaces")
# Box plot bedrooms
I <- price_boxplot(h1, "bedrooms", "Housing Prices by Number of Bedrooms")
# Box plot prefarea
J <- price_boxplot(h1, "prefarea", "Housing Prices by preferring area")
library(patchwork)
# TODO (original author): adjustments are still needed here - see the first
# file for how PDFs etc. are created.
# Combine the ten plots in a two-column layout, tagged with Roman numerals.
patch <- (J / C / B / D / H / E / A / F / I / G) +
  plot_layout(ncol = 2, widths = c(1, 1)) +
  plot_annotation(
    title = "xx",
    theme = theme(
      plot.caption = element_text(hjust = 0, size = 10),
      plot.title = element_text(hjust = 0.5, size = 1)
    ),
    tag_levels = "I"
  ) &
  theme(plot.tag = element_text(size = 10))
patch