Bridge Homework 2

Here, we call the csv file from a GitHub link. We clean up the data table. The columns need to be split, the data needs to be converted from characters into integers, and we need to assign a number to the “Houses” column because it is empty.

library(dplyr)

## 
## Attaching package: 'dplyr'

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

library(tidyr)
library(stringr)
# call the csv file from the github link
housePriceNY <- read.table("https://raw.githubusercontent.com/kristinlussi/bridgeRhw2/main/HousesNY.csv")
# clean up the data table
housePriceNY <- separate(housePriceNY, col = V2, into = c("Houses", "Price", "Beds", "Baths", "Size", "Lot"), sep = ",")

# remove the first row 
housePriceNY <- housePriceNY[-1, ]

# assign a number to each row in the "Houses" column
# Create a sequence of numbers from 1 to 53
houses_numbers <- 1:53

# Assign the sequence to the "Houses" column of the housePriceNY data frame
housePriceNY$Houses <- houses_numbers

# convert row data from column 2 and row 2 on to integer
housePriceNY <- housePriceNY %>% mutate_at(vars(Price, Beds, Baths, Size, Lot), as.integer)

# show first 20 rows of the data
head(housePriceNY, 20)

##    V1 Houses Price Beds Baths Size Lot
## 2   1      1    57    3     2    0   1
## 3   2      2   120    6     2    2   0
## 4   3      3   150    4     2    1   0
## 5   4      4   143    3     2    1   0
## 6   5      5    92    3     1    1   0
## 7   6      6    50    2     1    0   0
## 8   7      7    89    2     2    1   0
## 9   8      8   140    4     3    2   0
## 10  9      9   197    4     2    2   1
## 11 10     10   125    3     2    1   0
## 12 11     11   175    3     2    1   1
## 13 12     12    60    3     1    0   1
## 14 13     13   138    3     2    1   0
## 15 14     14   160    4     2    2   0
## 16 15     15    63    3     2    1   0
## 17 16     16   107    3     1    1   2
## 18 17     17   185    4     3    2   0
## 19 18     18    82    3     1    1   2
## 20 19     19    75    4     2    1   0
## 21 20     20   118    3     1    1   0

Here, we use the summary function to summarize the house price data from the data set. We then display the average price and the median price for each number of beds. We also display the average price and the median price for each house size.

summary(housePriceNY)

##        V1         Houses       Price            Beds           Baths      
##  Min.   : 1   Min.   : 1   Min.   : 38.0   Min.   :2.000   Min.   :1.000  
##  1st Qu.:14   1st Qu.:14   1st Qu.: 82.0   1st Qu.:3.000   1st Qu.:1.000  
##  Median :27   Median :27   Median :107.0   Median :3.000   Median :2.000  
##  Mean   :27   Mean   :27   Mean   :113.5   Mean   :3.396   Mean   :1.792  
##  3rd Qu.:40   3rd Qu.:40   3rd Qu.:141.0   3rd Qu.:4.000   3rd Qu.:2.000  
##  Max.   :53   Max.   :53   Max.   :197.0   Max.   :6.000   Max.   :3.000  
##       Size           Lot        
##  Min.   :0.00   Min.   :0.0000  
##  1st Qu.:1.00   1st Qu.:0.0000  
##  Median :1.00   Median :0.0000  
##  Mean   :1.17   Mean   :0.4717  
##  3rd Qu.:2.00   3rd Qu.:1.0000  
##  Max.   :3.00   Max.   :3.0000

# find the average price by number of bedrooms
avgPriceByBed <- aggregate(Price ~ Beds, data=housePriceNY, FUN = mean)

colnames(avgPriceByBed) <- c("Number of Beds", "Average Price")

print(avgPriceByBed)

##   Number of Beds Average Price
## 1              2       82.4000
## 2              3      101.0000
## 3              4      134.8947
## 4              5      147.5000
## 5              6      120.0000

# find the median price by number of bedrooms

medPriceByBed <- aggregate(Price ~ Beds, data=housePriceNY, FUN = median)

colnames(medPriceByBed) <- c("Number of Beds", "Median Price")

print(medPriceByBed)

##   Number of Beds Median Price
## 1              2         78.0
## 2              3         97.0
## 3              4        140.0
## 4              5        147.5
## 5              6        120.0

# find the average price by size
avgPriceBySize <- aggregate(Price ~ Size, data=housePriceNY, FUN = mean)

colnames(avgPriceBySize) <- c("Size", "Average Price")

print(avgPriceBySize)

##   Size Average Price
## 1    0      73.42857
## 2    1     107.77419
## 3    2     141.57143
## 4    3     179.00000

# find the median price by size
medPriceBySize <- aggregate(Price ~ Size, data=housePriceNY, FUN = median)

colnames(medPriceBySize) <- c("Size", "Median Price")

print(medPriceBySize)

##   Size Median Price
## 1    0         75.0
## 2    1        100.0
## 3    2        139.5
## 4    3        179.0

Here, we create a new data frame taking a subset of the columns and rows from the original data.

newHousePrice <- housePriceNY[5:20, c("Houses", "Beds", "Size", "Price")]

#show first 15 rows
head(newHousePrice, 15)

##    Houses Beds Size Price
## 6       5    3    1    92
## 7       6    2    0    50
## 8       7    2    1    89
## 9       8    4    2   140
## 10      9    4    2   197
## 11     10    3    1   125
## 12     11    3    1   175
## 13     12    3    0    60
## 14     13    3    1   138
## 15     14    4    2   160
## 16     15    3    1    63
## 17     16    3    1   107
## 18     17    4    2   185
## 19     18    3    1    82
## 20     19    4    1    75

Here, we create new column names for each column in the new data frame, newHousePrice.

# create new column names for the new data frame
colnames(newHousePrice) <- c("House.Number", "Number.of.Beds", "House.Size", "House.Price")

# show first 15 rows
head(newHousePrice, 15)

##    House.Number Number.of.Beds House.Size House.Price
## 6             5              3          1          92
## 7             6              2          0          50
## 8             7              2          1          89
## 9             8              4          2         140
## 10            9              4          2         197
## 11           10              3          1         125
## 12           11              3          1         175
## 13           12              3          0          60
## 14           13              3          1         138
## 15           14              4          2         160
## 16           15              3          1          63
## 17           16              3          1         107
## 18           17              4          2         185
## 19           18              3          1          82
## 20           19              4          1          75

Here, we print the mean and median for the same two attributes as before (price by number of bedrooms and price by house size).

The values have changed because this new data frame contains only a subset of the original data. The number of beds ranges from 2 to 4. With the original data table, the number of beds ranged from 2 to 6. With the original data, the house size ranged from 0 to 4, but with this new data frame, the house size ranges from 0 to 2. As a result, the median and mean price values have changed since they are aggregating over a smaller set of data.

# summarize the new data frame
summary(newHousePrice)

##   House.Number   Number.of.Beds    House.Size     House.Price    
##  Min.   : 5.00   Min.   :2.000   Min.   :0.000   Min.   : 50.00  
##  1st Qu.: 8.75   1st Qu.:3.000   1st Qu.:1.000   1st Qu.: 80.25  
##  Median :12.50   Median :3.000   Median :1.000   Median :112.50  
##  Mean   :12.50   Mean   :3.188   Mean   :1.125   Mean   :116.00  
##  3rd Qu.:16.25   3rd Qu.:4.000   3rd Qu.:1.250   3rd Qu.:145.00  
##  Max.   :20.00   Max.   :4.000   Max.   :2.000   Max.   :197.00

# find the new average price by number of bedrooms
newAvgPriceByBed <- aggregate(House.Price ~ Number.of.Beds, data=newHousePrice, FUN = mean)
colnames(newAvgPriceByBed) <- c("Number.of.Beds", "Average.Price")

print(newAvgPriceByBed)

##   Number.of.Beds Average.Price
## 1              2       69.5000
## 2              3      106.6667
## 3              4      151.4000

# find the new median price by number of bedrooms

newMedPriceByBed <- aggregate(House.Price ~ Number.of.Beds, data=newHousePrice, FUN = median)
colnames(newMedPriceByBed) <- c("Number.of.Beds", "Median.Price")

print(newMedPriceByBed)

##   Number.of.Beds Median.Price
## 1              2         69.5
## 2              3        107.0
## 3              4        160.0

# find the new average price by size
newAvgPriceBySize <- aggregate(House.Price ~ House.Size, data=newHousePrice, FUN = mean)
colnames(newAvgPriceBySize) <- c("House.Size", "Average.Price")

print(newAvgPriceBySize)

##   House.Size Average.Price
## 1          0          55.0
## 2          1         106.4
## 3          2         170.5

# find the new median price by size
newMedPriceBySize <- aggregate(House.Price ~ House.Size, data=newHousePrice, FUN = median)
colnames(newMedPriceBySize) <- c("House.Size", "Median.Price")

print(newMedPriceBySize)

##   House.Size Median.Price
## 1          0         55.0
## 2          1         99.5
## 3          2        172.5

Here, we change the number in the “Number.of.Beds” column to read “#” bedrooms instead of only the number. We also multiply the house price by 1000 to get the correct denomination.

# rename "2" to "2 bedrooms"
newHousePrice$Number.of.Beds <- str_replace_all(newHousePrice$Number.of.Beds, "2", "2 Bedrooms")

# rename "3" to "3 bedrooms"
newHousePrice$Number.of.Beds <- str_replace_all(newHousePrice$Number.of.Beds, "3", "3 Bedrooms")

# rename "4" to "4 bedrooms"
newHousePrice$Number.of.Beds <- str_replace_all(newHousePrice$Number.of.Beds, "4", "4 Bedrooms")

# multiply house price column by 1000
newHousePrice$House.Price <- newHousePrice$House.Price * 1000

#show updates and first 15 rows
head(newHousePrice, 15)

##    House.Number Number.of.Beds House.Size House.Price
## 6             5     3 Bedrooms          1       92000
## 7             6     2 Bedrooms          0       50000
## 8             7     2 Bedrooms          1       89000
## 9             8     4 Bedrooms          2      140000
## 10            9     4 Bedrooms          2      197000
## 11           10     3 Bedrooms          1      125000
## 12           11     3 Bedrooms          1      175000
## 13           12     3 Bedrooms          0       60000
## 14           13     3 Bedrooms          1      138000
## 15           14     4 Bedrooms          2      160000
## 16           15     3 Bedrooms          1       63000
## 17           16     3 Bedrooms          1      107000
## 18           17     4 Bedrooms          2      185000
## 19           18     3 Bedrooms          1       82000
## 20           19     4 Bedrooms          1       75000

Bridge Homework 2

Kristin Lussi

2023-07-23