#install.packages("Ecdat")
library(Ecdat)
## 
## Attaching package: 'Ecdat'
## The following object is masked from 'package:datasets':
## 
##     Orange
# Make sure the Ecdat housing data is a plain data.frame, then preview
# the first ten rows.
Housing <- as.data.frame(x = Housing)
head(x = Housing, n = 10)
##    price lotsize bedrooms bathrms stories driveway recroom fullbase gashw airco
## 1  42000    5850        3       1       2      yes      no      yes    no    no
## 2  38500    4000        2       1       1      yes      no       no    no    no
## 3  49500    3060        3       1       1      yes      no       no    no    no
## 4  60500    6650        3       1       2      yes     yes       no    no    no
## 5  61000    6360        2       1       1      yes      no       no    no    no
## 6  66000    4160        3       1       1      yes     yes      yes    no   yes
## 7  66000    3880        3       2       2      yes      no      yes    no    no
## 8  69000    4160        3       1       3      yes      no       no    no    no
## 9  83800    4800        3       1       1      yes     yes      yes    no    no
## 10 88500    5500        3       2       4      yes     yes       no    no   yes
##    garagepl prefarea
## 1         1       no
## 2         0       no
## 3         0       no
## 4         0       no
## 5         0       no
## 6         0       no
## 7         2       no
## 8         0       no
## 9         0       no
## 10        1       no

Explain the data set

Since I am interested in real estate, I chose a data set that contains information about houses sold in Windsor, Canada (the Housing data from the Ecdat package). It contains 546 observations and 12 different variables describing the characteristics of the houses and their neighborhoods. For the purposes of this task I took 5 variables (price, lotsize, bedrooms, bathrooms, air conditioning).

Description: Price: The selling price of the house (in dollars). Lotsize: The size of the lot (square feet). Bedrooms: Number of bedrooms in the house. Bathrooms: Number of bathrooms. Airco: Factor variable (yes/no) indicating whether the house has air conditioning.

#Perform some data manipulations (create new variable, delete some units due to missing data, rename variables, create new data.frame based on conditions, etc.).

# List the ten cheapest houses, keeping only the variables analysed below.
head(
  x = Housing[
    order(Housing[["price"]]),
    c("price", "lotsize", "bedrooms", "bathrms", "airco")
  ],
  n = 10
)
##     price lotsize bedrooms bathrms airco
## 56  25000    3620        2       1    no
## 163 25000    2910        3       1    no
## 233 25000    3850        3       1    no
## 57  25245    2400        3       1    no
## 239 26000    3000        2       1    no
## 139 26500    2990        2       1    no
## 13  27000    1700        3       1    no
## 244 27000    3649        2       1    no
## 185 28000    3420        5       1    no
## 60  30000    2400        3       1    no

Here I’ve ordered houses by their price (table starts with cheapest price).

#data manipulation

# Derived variable: sale price divided by lot size (price per sq ft of lot).
Housing[["price_per_sqft"]] <- with(Housing, price / lotsize)

# Rename the bathroom-count column from "bathrms" to "baths".
names(Housing)[names(Housing) == "bathrms"] <- "baths"

# Ten cheapest houses again, now with the renamed and derived columns.
head(
  x = Housing[
    order(Housing[["price"]]),
    c("price", "lotsize", "bedrooms", "baths", "airco", "price_per_sqft")
  ],
  n = 10
)
##     price lotsize bedrooms baths airco price_per_sqft
## 56  25000    3620        2     1    no       6.906077
## 163 25000    2910        3     1    no       8.591065
## 233 25000    3850        3     1    no       6.493506
## 57  25245    2400        3     1    no      10.518750
## 239 26000    3000        2     1    no       8.666667
## 139 26500    2990        2     1    no       8.862876
## 13  27000    1700        3     1    no      15.882353
## 244 27000    3649        2     1    no       7.399287
## 185 28000    3420        5     1    no       8.187135
## 60  30000    2400        3     1    no      12.500000

I’ve created a new variable called price_per_sqft, which is calculated by dividing the house price by the total lot size. Lot size is the total land area of the property, including the house, yard, and driveway, not just the building. So the variable price_per_sqft tells us the price of the property per square foot of land, which is useful for comparing properties relative to their lot size, especially when some houses sit on very small or very large lots. Moreover, I renamed the variable bathrms to baths, so the column name now clearly indicates the number of bathrooms in each house.

#Present the descriptive statistics for the selected variables and explain at least 3 sample statistics (mean, median, etc.).

# Descriptive statistics for the selected variables: five-number summary
# and mean for numeric columns, level counts for the airco factor.
summary(
  Housing[c("price", "lotsize", "bedrooms", "baths", "airco", "price_per_sqft")]
)
##      price           lotsize         bedrooms         baths       airco    
##  Min.   : 25000   Min.   : 1650   Min.   :1.000   Min.   :1.000   no :373  
##  1st Qu.: 49125   1st Qu.: 3600   1st Qu.:2.000   1st Qu.:1.000   yes:173  
##  Median : 62000   Median : 4600   Median :3.000   Median :1.000            
##  Mean   : 68122   Mean   : 5150   Mean   :2.965   Mean   :1.286            
##  3rd Qu.: 82000   3rd Qu.: 6360   3rd Qu.:3.000   3rd Qu.:2.000            
##  Max.   :190000   Max.   :16200   Max.   :6.000   Max.   :4.000            
##  price_per_sqft  
##  Min.   : 3.863  
##  1st Qu.:10.648  
##  Median :13.613  
##  Mean   :14.194  
##  3rd Qu.:16.917  
##  Max.   :37.714

The lowest selling price among all the houses is 25,000 dollars. The highest selling price of a house is 190,000 dollars. On average, houses are sold for 68,122 dollars (mean). Half of the houses have a price of up to 62,000 dollars, while the other half have a price above 62,000 dollars (median). 25% of houses cost less than 49,125 dollars. 75% of houses cost less than 82,000 dollars.

The minimum lotsize among all the houses is 1650 sq. feet. The largest lot size is 16200 square feet. On average, houses have a lot of 5150 sq ft. (mean). Half of houses have a lot size up to 4,600 sq ft, while the other half have a lot size above 4,600 sq ft. 25% of the houses have a lot size smaller than 3,600 sq ft and 75% of the houses have a lot size smaller than 6,360 sq ft.

The minimum number of bedrooms in a house is 1, while the maximum number of bedrooms is 6. On average, there are 2.965 bedrooms per house (mean). The median is 3 bedrooms: at least half of the houses have 3 or fewer bedrooms, and at least half have 3 or more. At least 25% of houses have 2 or fewer bedrooms. Because the third quartile is also 3, at least 75% of houses have 3 or fewer bedrooms.

The minimum number of bathrooms in a house is 1, while the maximum number of bathrooms is 4. On average, there are 1.286 bathrooms per house (mean). The median is 1 bathroom, which is also the minimum, so at least half of the houses have exactly 1 bathroom. The first quartile is also 1 bathroom. At least 75% of houses have 2 or fewer bathrooms.

The number of houses that have air conditioning is 173, while 373 houses do not have it.

The cheapest house per square foot of lot costs 3.86 dollars per sq ft, while the most expensive house per square foot of lot costs about 37.71 dollars per sq ft. On average, the price per square foot of lot is 14.19 dollars (mean). Half of the houses have a price per square foot of lot up to 13.61 dollars, while the other half have a price per square foot above 13.61 dollars (median). 25% of houses have a price per square foot of lot below 10.648 dollars. 75% of the houses have a price per square foot of lot below 16.917 dollars.

#Graph the distribution of the variables using histograms, scatterplots, and/or boxplots. Explain the results.

# Histogram of selling prices, using roughly 20 bins.
hist(
  x = Housing[["price"]],
  breaks = 20,
  col = "hotpink",
  border = "pink",
  main = "Histogram of House Prices",
  xlab = "Price ($)"
)

#The histogram of house prices is right-skewed: most of the houses are in the lower price range and very few are expensive, with the highest selling price being 190,000 dollars (maximum).

# Histogram of price per square foot of lot, using roughly 20 bins.
hist(
  x = Housing[["price_per_sqft"]],
  breaks = 20,
  col = "navy",
  border = "pink",
  main = "Histogram of Price per sq ft",
  xlab = "Price per sq ft ($)"
)

#Most of the bars are centered around the median of 13.61 dollars per sq ft, which means that most houses have a typical value near this central price. The longer tail at the high end indicates outliers - houses with unusually high prices per sq ft of lot, which could be due to smaller urban lots with high demand or premium locations.

# Scatterplot of selling price against number of bedrooms.
with(
  Housing,
  plot(
    x = bedrooms,
    y = price,
    pch = 19,
    col = "pink",
    main = "Price vs. Number of Bedrooms",
    xlab = "Bedrooms",
    ylab = "Price ($)"
  )
)

#This scatter plot shows the number of bedrooms vs. the house price. There is a positive relationship between the number of bedrooms and the house price, meaning that, on average, houses with more bedrooms tend to be more expensive. However, some houses with fewer bedrooms still have high prices. This is because other variables, such as a better location, a larger lot size, or additional features like air conditioning, also influence the price.