2025-04-28

R Markdown

This is an R Markdown presentation. Markdown is a simple formatting syntax for authoring HTML, PDF, and MS Word documents. For more details on using R Markdown see http://rmarkdown.rstudio.com.

When you click the Knit button a document will be generated that includes both content as well as the output of any embedded R code chunks within the document.

Slide with Bullets

  • Bullet 1
  • Bullet 2
  • Bullet 3

Slide with R Output

# Print per-column summary statistics for the `cars` data (speed, dist).
summary(cars)
##      speed           dist       
##  Min.   : 4.0   Min.   :  2.00  
##  1st Qu.:12.0   1st Qu.: 26.00  
##  Median :15.0   Median : 36.00  
##  Mean   :15.4   Mean   : 42.98  
##  3rd Qu.:19.0   3rd Qu.: 56.00  
##  Max.   :25.0   Max.   :120.00

Slide with Plot

# Packages used by the analysis: readxl (read_excel), dplyr (data verbs and
# the %>% pipe), ggplot2 (plots), gridExtra (grid.arrange).
# One call per line: R cannot parse several statements on one line unless
# they are separated by `;`.
library(readxl)
library(dplyr)
library(ggplot2)
library(gridExtra)

Load the dataset

# Read the Ames housing workbook into a tibble.
# Forward slashes are used in the path: inside an R string a single backslash
# starts an escape sequence, so the original Windows-style path does not parse.
data <- read_excel("C:/Users/abhis/Downloads/AmesHousing.xlsx")

# Second name for the same table; the analysis uses both `data` and
# `AmesHousing`.
AmesHousing <- data

Display the first few rows of the data

# Preview the table; n = 6L is head()'s default row count, spelled out.
head(AmesHousing, n = 6L)

1. Column names and data types

# Heading followed by the structure of the table.
cat("## 1. Column Names and Data Types\n")
# str() prints its report directly, one line per column. The previous
# round trip through a temporary file re-printed the lines with sep = "",
# which glued them into a single unreadable line.
str(data)

2. Missing values per column

# Count NA cells per column and print only the columns that have any.
cat("## 2. Missing Values per Column\n")
missing_values <- colSums(is.na(AmesHousing))
missing_df <- data.frame(
  Column = names(missing_values),
  Missing_Count = missing_values
)
print(missing_df[missing_df$Missing_Count > 0, ])

3. Dimensions of the dataset

# Report the number of rows and columns of the table.
cat("## 3. Dimensions of the Dataset\n")
dimensions <- dim(data)
# cat() does not add a line break on its own, so each line ends with "\n".
cat("Number of rows:", dimensions[1], "\n")
cat("Number of columns:", dimensions[2], "\n")

4. Summary statistics for numeric columns

# Summary statistics restricted to the numeric columns.
cat("## 4. Summary Statistics for Numeric Columns\n")
# vapply() guarantees a logical vector even for a zero-column table.
numeric_cols <- vapply(AmesHousing, is.numeric, logical(1))
# Printing the summary directly keeps its line breaks; the previous
# temporary-file round trip re-emitted the lines with sep = "".
print(summary(AmesHousing[, numeric_cols]))

5. Unique values in some key categorical variables

# List the distinct values of every character column as a bullet list.
cat("## 5. Unique Values in Key Categorical Variables\n")
character_cols <- vapply(AmesHousing, is.character, logical(1))
# lapply() always returns a named list. sapply() would simplify the result to
# a matrix whenever every column has the same number of distinct values;
# names() of a matrix is NULL, so the loop below would silently do nothing.
unique_categorical <- lapply(AmesHousing[, character_cols], unique)
for (col in names(unique_categorical)) {
  cat("### Unique values in", col, "\n")
  # One "- value" bullet per line.
  cat(paste("-", unique_categorical[[col]], collapse = "\n"))
  cat("\n")
}

LEVEL 2: Data Extraction & Filtering

# Banner for the data extraction and filtering section.
cat("## LEVEL 2: Data Extraction & Filtering\n")
cat("——————————-\n")

6. Houses sold for more than $500,000

# Count the houses whose sale price exceeds 500,000.
expensive_houses_count <- AmesHousing %>%
  filter(SalePrice > 500000) %>%
  nrow()
cat("### 6. Houses Sold for More Than $500,000\n")
cat(
  "Number of houses sold for more than $500,000:",
  expensive_houses_count,
  "\n"
)

7. Houses with 3 or more bathrooms

# Count the houses where full baths plus half baths total at least 3.
# Column names containing spaces must be wrapped in backticks.
three_plus_baths <- AmesHousing %>%
  filter(`Full Bath` + `Half Bath` >= 3) %>%
  nrow()
cat("### 7. Houses with 3 or More Bathrooms\n")
cat("Number of houses with 3 or more bathrooms:", three_plus_baths, "\n")

8. Houses built before 1950

# Keep the houses built before 1950 and report how many there are.
older_houses <- filter(data, `Year Built` < 1950)
cat("### 8. Houses Built Before 1950\n")
cat("Number of houses built before 1950:", nrow(older_houses), "\n")
# You might want to display some of these houses:
# head(older_houses)

9. Houses with both a garage and a pool

# Keep the houses with a non-zero garage area and a non-zero pool area.
garage_pool_houses <- filter(data, `Garage Area` > 0 & `Pool Area` > 0)
cat("### 9. Houses with Both a Garage and a Pool\n")
cat(
  "Number of houses with both a garage and a pool:",
  nrow(garage_pool_houses),
  "\n"
)
# You might want to display some of these houses:
# head(garage_pool_houses)

cat("——————————-\n")

LEVEL 3: Grouping & Summarization

# Banner for the grouping and summarization section.
cat("## LEVEL 3: Grouping & Summarization\n")
cat("——————————-\n")

11. Average sale price per neighborhood

# Mean sale price for each neighborhood, ignoring missing prices.
avg_price_neighborhood <- data %>%
  group_by(Neighborhood) %>%
  summarise(Avg_Price = mean(SalePrice, na.rm = TRUE))
cat("### 11. Average Sale Price per Neighborhood\n")
print(avg_price_neighborhood)
cat("\n")

12. Median house age per neighborhood

# Median house age per neighborhood; age is measured against the year 2025.
median_age_neighborhood <- data %>%
  group_by(Neighborhood) %>%
  summarise(Median_Age = median(2025 - `Year Built`, na.rm = TRUE))
cat("### 12. Median House Age per Neighborhood\n")
print(median_age_neighborhood)
cat("\n")

13. Houses sold per year

# Number of rows (sales) for each value of `Yr Sold`.
houses_sold_year <- data %>%
  group_by(`Yr Sold`) %>%
  summarise(Houses_Sold = n())
cat("### 13. Houses Sold per Year\n")
print(houses_sold_year)
cat("\n")

14. Average lot area by building type

# Mean lot area for each building type, ignoring missing areas.
avg_lot_area_bldg_type <- data %>%
  group_by(`Bldg Type`) %>%
  summarise(Avg_Lot_Area = mean(`Lot Area`, na.rm = TRUE))
cat("### 14. Average Lot Area by Building Type\n")
print(avg_lot_area_bldg_type)
cat("\n")

15. Neighborhood with highest average sale price

# The single neighborhood with the highest mean sale price: summarise per
# neighborhood, sort descending, keep the first row.
highest_avg_price_neighborhood <- data %>%
  group_by(Neighborhood) %>%
  summarise(Avg_Price = mean(SalePrice, na.rm = TRUE)) %>%
  arrange(desc(Avg_Price)) %>%
  head(1)
cat("### 15. Neighborhood with Highest Average Sale Price\n")
print(highest_avg_price_neighborhood)
cat("\n")

cat("——————————-\n")

LEVEL 4: Sorting & Ranking

# Banner for the sorting and ranking section.
cat("## LEVEL 4: Sorting & Ranking\n")
cat("——————————-\n")

16. Top 10 most expensive houses

# The ten rows with the highest sale price.
top_10_expensive <- data %>%
  arrange(desc(SalePrice)) %>%
  head(10)
cat("### 16. Top 10 Most Expensive Houses\n")
# Display relevant columns.
# NOTE(review): the public Ames housing data does not appear to contain an
# "Address" column - confirm against names(data). any_of() keeps whichever of
# the requested columns exist instead of aborting on a missing one.
print(select(top_10_expensive, any_of(c("PID", "Address", "SalePrice"))))
cat("\n")

17. Neighborhoods by number of houses sold

# Number of rows per neighborhood, largest count first.
houses_sold_by_neighborhood <- data %>%
  count(Neighborhood, sort = TRUE)
cat("### 17. Neighborhoods by Number of Houses Sold\n")
print(houses_sold_by_neighborhood)
cat("\n")

18. Top 5 largest houses by square footage

# The five houses with the largest combined above-ground living area and
# basement area.
largest_houses <- data %>%
  mutate(Total_SF = `Gr Liv Area` + `Total Bsmt SF`) %>%
  arrange(desc(Total_SF)) %>%
  head(5)
cat("### 18. Top 5 Largest Houses by Square Footage\n")
# Display relevant columns.
# NOTE(review): the public Ames housing data does not appear to contain an
# "Address" column - confirm against names(data). any_of() keeps whichever of
# the requested columns exist instead of aborting on a missing one.
display_cols <- c("PID", "Address", "Total_SF", "Gr Liv Area", "Total Bsmt SF")
print(select(largest_houses, any_of(display_cols)))
cat("\n")

19. Neighborhoods with highest average lot size

# Mean lot area per neighborhood, sorted from largest to smallest.
highest_avg_lot_size_neighborhood <- data %>%
  group_by(Neighborhood) %>%
  summarise(Avg_Lot = mean(`Lot Area`, na.rm = TRUE)) %>%
  arrange(desc(Avg_Lot))
cat("### 19. Neighborhoods with Highest Average Lot Size\n")
print(highest_avg_lot_size_neighborhood)
cat("\n")

20. Houses ranked by number of above ground rooms

# All houses ordered by number of above-ground rooms, most rooms first.
# NOTE(review): every other multi-word column in this analysis is spelled with
# spaces (`Gr Liv Area`, `Year Built`, ...), so the room count is assumed to be
# `TotRms AbvGrd` rather than `TotRms_AbvGrd` - confirm against names(data).
ranked_by_rooms <- data %>%
  arrange(desc(`TotRms AbvGrd`))
cat("### 20. Houses Ranked by Number of Above Ground Rooms\n")
# Display relevant columns.
# NOTE(review): the public Ames housing data does not appear to contain an
# "Address" column; any_of() keeps whichever requested columns exist.
print(select(ranked_by_rooms, any_of(c("PID", "Address", "TotRms AbvGrd"))))
cat("\n")

cat("——————————-\n")

LEVEL 5: Feature Engineering

# Banner for the feature engineering section.
cat("## LEVEL 5: Feature Engineering\n")
cat("——————————-\n")

21. House_Age column

# Add `House Age`: years between construction and 2025.
# A column name with a space needs backticks on both sides of the assignment.
data <- data %>%
  mutate(`House Age` = 2025 - `Year Built`)
cat("### 21. House_Age Column Created\n")
cat("The ‘House Age’ column has been added to the dataset.\n")
# Display the first few rows with the new column
# head(data[, c("Year Built", "House Age")])

22. Price per square foot

# Add `Price per Sqft`: sale price divided by above-ground living area.
data <- data %>%
  mutate(`Price per Sqft` = SalePrice / `Gr Liv Area`)
cat("### 22. Price per Square Foot Column Created\n")
cat("The ‘Price per Sqft’ column has been added to the dataset.\n")
# Display the first few rows with the new column
# head(data[, c("SalePrice", "Gr Liv Area", "Price per Sqft")])

24. Identify remodeled houses

# Add `Is Remodeled`: 1 when the remodel year is later than the build year,
# 0 otherwise (NA when either year is missing).
data <- data %>%
  mutate(`Is Remodeled` = ifelse(`Year Remod/Add` > `Year Built`, 1, 0))
cat("### 24. Is Remodeled Column Created\n")
cat("The ‘Is Remodeled’ column (1 for remodeled, 0 otherwise) has been added.\n")
# Display the first few rows with the new column
# head(data[, c("Year Built", "Year Remod/Add", "Is Remodeled")])

25. Percentage of remodeled houses

# Share of remodeled houses: the mean of the 0/1 flag times 100.
# `$` needs backticks to reach a column whose name contains a space.
percentage_remodeled <- mean(data$`Is Remodeled`, na.rm = TRUE) * 100
cat("### 25. Percentage of Remodeled Houses\n")
cat("Percentage of remodeled houses:", round(percentage_remodeled, 2), "%\n")

cat("——————————-\n")

# Banner for the histogram section.
cat("## Exploratory Data Analysis - Histograms\n")
cat("——————————-\n")

1. Sale Price Distribution

# Histogram of sale prices in bins of 10,000.
p1_hist <- ggplot(AmesHousing, aes(x = SalePrice)) +
  geom_histogram(binwidth = 10000, fill = "steelblue", color = "black") +
  labs(
    title = "Distribution of Sale Prices in Ames",
    x = "Sale Price",
    y = "Frequency"
  ) +
  theme_minimal()
cat("### 1. Sale Price Distribution\n")
print(p1_hist)
cat("\n")
cat("The distribution of sale prices shows…\n") # Add your interpretation here

2. Living Area Distribution

# Histogram of above-ground living area in bins of 100 sq ft.
p2_hist <- ggplot(AmesHousing, aes(x = `Gr Liv Area`)) +
  geom_histogram(binwidth = 100, fill = "lightcoral", color = "black") +
  labs(
    title = "Distribution of Above Ground Living Area",
    x = "Above Ground Living Area (sq ft)",
    y = "Frequency"
  ) +
  theme_minimal()
cat("### 2. Living Area Distribution\n")
print(p2_hist)
cat("\n")
# Add your interpretation here
cat("The distribution of above ground living area shows…\n")

3. Lot Size Variation

# Histogram of lot area in bins of 2,000 sq ft.
p3_hist <- ggplot(AmesHousing, aes(x = `Lot Area`)) +
  geom_histogram(binwidth = 2000, fill = "lightgreen", color = "black") +
  labs(
    title = "Distribution of Lot Area",
    x = "Lot Area (sq ft)",
    y = "Frequency"
  ) +
  theme_minimal()
cat("### 3. Lot Size Variation\n")
print(p3_hist)
cat("\n")
cat("The distribution of lot sizes shows…\n") # Add your interpretation here

4. House Age Distribution

# Calculate House Age (years between construction and 2025) on the table
# used for plotting.
AmesHousing <- AmesHousing %>%
  mutate(`House Age` = 2025 - `Year Built`)

# Histogram of house age in 5-year bins.
p4_hist <- ggplot(AmesHousing, aes(x = `House Age`)) +
  geom_histogram(binwidth = 5, fill = "lightblue", color = "black") +
  labs(
    title = "Distribution of House Age",
    x = "House Age (Years)",
    y = "Frequency"
  ) +
  theme_minimal()
cat("### 4. House Age Distribution\n")
print(p4_hist)
cat("\n")
cat("The distribution of house ages shows…\n") # Add your interpretation here

5. Histogram Bin Width Impact

Sale Price - Different bin widths

# Same sale-price histogram at a narrow (5,000) and a wide (20,000) bin width.
p5_hist1 <- ggplot(AmesHousing, aes(x = SalePrice)) +
  geom_histogram(binwidth = 5000, fill = "steelblue", color = "black") +
  labs(
    title = "Sale Price (Binwidth = 5000)",
    x = "Sale Price",
    y = "Frequency"
  ) +
  theme_minimal()
p5_hist2 <- ggplot(AmesHousing, aes(x = SalePrice)) +
  geom_histogram(binwidth = 20000, fill = "steelblue", color = "black") +
  labs(
    title = "Sale Price (Binwidth = 20000)",
    x = "Sale Price",
    y = "Frequency"
  ) +
  theme_minimal()

Living Area - Different bin widths

# Same living-area histogram at a narrow (50) and a wide (200) bin width.
p5_hist3 <- ggplot(AmesHousing, aes(x = `Gr Liv Area`)) +
  geom_histogram(binwidth = 50, fill = "lightcoral", color = "black") +
  labs(
    title = "Living Area (Binwidth = 50)",
    x = "Living Area",
    y = "Frequency"
  ) +
  theme_minimal()
p5_hist4 <- ggplot(AmesHousing, aes(x = `Gr Liv Area`)) +
  geom_histogram(binwidth = 200, fill = "lightcoral", color = "black") +
  labs(
    title = "Living Area (Binwidth = 200)",
    x = "Living Area",
    y = "Frequency"
  ) +
  theme_minimal()

# Draw each pair of bin-width variants side by side (one row, two plots).
cat("### 5. Histogram Bin Width Impact\n")
grid.arrange(p5_hist1, p5_hist2, nrow = 1)
grid.arrange(p5_hist3, p5_hist4, nrow = 1)
# Add your interpretation
cat("the bin width affects the level of detail shown in the histogram…\n")

6. Comparative Distribution (Normalized)

Normalized Sale Price

# Density-scaled histogram of sale price with a kernel density curve on top.
# after_stat(density) replaces the deprecated `..density..` notation, and
# `linewidth` replaces `size` for line thickness (ggplot2 >= 3.4.0).
p6_hist1 <- ggplot(AmesHousing, aes(x = SalePrice, y = after_stat(density))) +
  geom_histogram(fill = "steelblue", color = "black", alpha = 0.7) +
  geom_density(color = "darkblue", linewidth = 1) +
  labs(
    title = "Normalized Distribution of Sale Price",
    x = "Sale Price",
    y = "Density"
  ) +
  theme_minimal()

Normalized Living Area

# Density-scaled histogram of living area with a kernel density curve on top.
# after_stat(density) replaces the deprecated `..density..` notation, and
# `linewidth` replaces `size` for line thickness (ggplot2 >= 3.4.0).
p6_hist2 <- ggplot(
  AmesHousing,
  aes(x = `Gr Liv Area`, y = after_stat(density))
) +
  geom_histogram(fill = "lightcoral", color = "black", alpha = 0.7) +
  geom_density(color = "darkred", linewidth = 1) +
  labs(
    title = "Normalized Distribution of Living Area",
    x = "Living Area",
    y = "Density"
  ) +
  theme_minimal()

# Show the two density-scaled plots side by side.
cat("### 6. Comparative Distribution (Normalized)\n")
grid.arrange(p6_hist1, p6_hist2, nrow = 1)
# Add your interpretation
cat("the normalized distributions of Sale Price and Living Area…\n")

7. Distribution of Year Built

# Histogram of construction year in 5-year bins.
p7_hist <- ggplot(AmesHousing, aes(x = `Year Built`)) +
  geom_histogram(binwidth = 5, fill = "lightgreen", color = "black") +
  labs(
    title = "Distribution of Year Built",
    x = "Year Built",
    y = "Frequency"
  ) +
  theme_minimal()
cat("### 7. Distribution of Year Built\n")
print(p7_hist)
cat("\n")
# Add your interpretation
cat("The distribution of house construction over the years shows…\n")

8. Distribution of Basement Square Footage

# Histogram of total basement area in bins of 100 sq ft.
p8_hist <- ggplot(AmesHousing, aes(x = `Total Bsmt SF`)) +
  geom_histogram(binwidth = 100, fill = "lightyellow", color = "black") +
  labs(
    title = "Distribution of Total Basement Square Footage",
    x = "Total Basement Square Footage (sq ft)",
    y = "Frequency"
  ) +
  theme_minimal()
cat("### 8. Distribution of Basement Square Footage\n")
print(p8_hist)
cat("\n")
# Add your interpretation
cat("The distribution of total basement square footage indicates…\n")

cat("——————————-\n")

# Banner for the bar chart section.
cat("## Exploratory Data Analysis - Bar Charts\n")
cat("——————————-\n")

1. Bar Chart of Mean Sale Price by Neighborhood

p9_bar <- ggplot(AmesHousing, aes(x = Neighborhood, y = SalePrice))