1. Chargement des données
# Load the training data from a path relative to the working directory.
df <- read.csv(file = "../data/train.csv")

# Print the table size: the row count is reported as houses and the column
# count as variables (the message itself is in French).
cat("Dimensions :", dim(df)[1L], "maisons x", dim(df)[2L], "variables")
## Dimensions : 1460 maisons x 81 variables
2. Distribution des prix
# Histogram of SalePrice using 50 bins; the white bar outline separates
# adjacent bins visually.
ggplot(data = df, mapping = aes(x = SalePrice)) +
  geom_histogram(bins = 50, color = "white", fill = "steelblue") +
  labs(
    title = "Distribution du prix de vente",
    x = "Prix ($)",
    y = "Nombre de maisons"
  ) +
  theme_minimal()

3. Prix par quartier
# Horizontal bar chart of the median SalePrice for each Neighborhood.
# The summary rows are not sorted beforehand: reorder() in the aesthetics
# already orders the bars by median_price, so sorting the rows first would
# have no effect on the plot.
df %>%
  group_by(Neighborhood) %>%
  summarise(median_price = median(SalePrice)) %>%
  ggplot(aes(
    x = reorder(Neighborhood, median_price),
    y = median_price,
    fill = median_price
  )) +
  geom_col() +
  scale_fill_gradient(low = "lightblue", high = "darkblue") +
  # Flip the axes so the neighborhood names run along the vertical axis.
  coord_flip() +
  labs(
    title = "Prix médian par quartier",
    x = "Quartier",
    y = "Prix médian ($)"
  ) +
  theme_minimal() +
  # Fill maps the same value as the bar length, so the legend is hidden.
  theme(legend.position = "none")

4. Prix selon la qualité globale
# Box plots of SalePrice, one per OverallQual value. OverallQual is converted
# to a factor so that it is treated as discrete on both the x axis and the
# fill scale.
ggplot(
  data = df,
  mapping = aes(
    x = factor(OverallQual),
    y = SalePrice,
    fill = factor(OverallQual)
  )
) +
  geom_boxplot() +
  scale_fill_brewer(palette = "RdYlBu") +
  labs(
    title = "Prix selon la qualité globale",
    x = "Qualité (1-10)",
    y = "Prix ($)"
  ) +
  theme_minimal() +
  # Fill duplicates the x axis grouping, so the legend is hidden.
  theme(legend.position = "none")

5. Superficie vs Prix
# Scatter plot of SalePrice against GrLivArea with a fitted straight line.
ggplot(data = df, mapping = aes(x = GrLivArea, y = SalePrice)) +
  # Semi-transparent points keep dense regions readable.
  geom_point(alpha = 0.4, color = "steelblue") +
  # The formula is stated explicitly so the fitted model is pinned down and
  # ggplot2 does not print its "using formula" message on every render.
  # se = FALSE draws the line without a confidence band.
  geom_smooth(method = "lm", formula = y ~ x, color = "red", se = FALSE) +
  labs(
    title = "Prix selon la superficie",
    x = "Superficie (pieds carrés)",
    y = "Prix ($)"
  ) +
  theme_minimal()
