1. Chargement des données

# Load the training set; the path is relative to the current working directory.
df <- read.csv("../data/train.csv")
# Report the table size. cat() does not append a newline on its own, so the
# explicit "\n" ends the line and keeps later console output (or the prompt)
# from being glued to this message.
cat("Dimensions :", nrow(df), "maisons x", ncol(df), "variables\n")
## Dimensions : 1460 maisons x 81 variables

2. Distribution des prix

# Histogram of SalePrice split into 50 bins, with a minimal theme.
ggplot(data = df) +
  geom_histogram(
    mapping = aes(x = SalePrice),
    bins = 50,
    colour = "white",
    fill = "steelblue"
  ) +
  ggtitle("Distribution du prix de vente") +
  xlab("Prix ($)") +
  ylab("Nombre de maisons") +
  theme_minimal()

3. Prix par quartier

# Median SalePrice per Neighborhood, drawn as horizontal bars.
df %>%
  group_by(Neighborhood) %>%
  # One row per neighbourhood holding its median sale price.
  summarise(median_price = median(SalePrice)) %>%
  # No arrange() step: the row order of the summary has no effect on the plot,
  # because the bar order is decided by reorder() in the x aesthetic below.
  ggplot(aes(
    x = reorder(Neighborhood, median_price),
    y = median_price,
    fill = median_price
  )) +
  geom_col() +
  # Darker bars correspond to higher median prices.
  scale_fill_gradient(low = "lightblue", high = "darkblue") +
  # Swap the axes so the neighbourhood names sit on the vertical axis.
  coord_flip() +
  labs(title = "Prix médian par quartier",
       x = "Quartier", y = "Prix médian ($)") +
  theme_minimal() +
  # The fill colour repeats what the bar length already shows; hide its legend.
  theme(legend.position = "none")

4. Prix selon la qualité globale

# Box plots of SalePrice for each OverallQual level, one fill colour per level.
ggplot(data = df) +
  geom_boxplot(
    mapping = aes(
      x = factor(OverallQual),
      y = SalePrice,
      fill = factor(OverallQual)
    )
  ) +
  scale_fill_brewer(palette = "RdYlBu") +
  ggtitle("Prix selon la qualité globale") +
  xlab("Qualité (1-10)") +
  ylab("Prix ($)") +
  theme_minimal() +
  # The x axis already identifies each level, so the fill legend is hidden.
  theme(legend.position = "none")

5. Superficie vs Prix

# Scatter plot of SalePrice against GrLivArea with a straight-line fit on top.
ggplot(df, aes(x = GrLivArea, y = SalePrice)) +
  # Semi-transparent points keep dense regions readable.
  geom_point(alpha = 0.4, color = "steelblue") +
  # The formula is spelled out explicitly: it is the one geom_smooth() uses for
  # method = "lm" by default, but stating it avoids the informational message
  # ggplot2 emits when the formula is left implicit. se = FALSE hides the
  # confidence band.
  geom_smooth(method = "lm", formula = y ~ x, color = "red", se = FALSE) +
  labs(title = "Prix selon la superficie",
       x = "Superficie (pieds carrés)", y = "Prix ($)") +
  theme_minimal()