Here, we call the csv file from a GitHub link. We clean up the data table. The columns need to be split, the data needs to be converted from characters into integers, and we need to assign a number to the “Houses” column because it is empty.
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(tidyr)
library(stringr)
# call the csv file from the github link
housePriceNY <- read.table("https://raw.githubusercontent.com/kristinlussi/bridgeRhw2/main/HousesNY.csv")
# clean up the data table
housePriceNY <- separate(housePriceNY, col = V2, into = c("Houses", "Price", "Beds", "Baths", "Size", "Lot"), sep = ",")
# remove the first row
housePriceNY <- housePriceNY[-1, ]
# assign a number to each row in the "Houses" column
# Create a sequence of numbers from 1 to 53
houses_numbers <- 1:53
# Assign the sequence to the "Houses" column of the housePriceNY data frame
housePriceNY$Houses <- houses_numbers
# convert row data from column 2 and row 2 on to integer
housePriceNY <- housePriceNY %>% mutate_at(vars(Price, Beds, Baths, Size, Lot), as.integer)
# show first 20 rows of the data
head(housePriceNY, 20)
## V1 Houses Price Beds Baths Size Lot
## 2 1 1 57 3 2 0 1
## 3 2 2 120 6 2 2 0
## 4 3 3 150 4 2 1 0
## 5 4 4 143 3 2 1 0
## 6 5 5 92 3 1 1 0
## 7 6 6 50 2 1 0 0
## 8 7 7 89 2 2 1 0
## 9 8 8 140 4 3 2 0
## 10 9 9 197 4 2 2 1
## 11 10 10 125 3 2 1 0
## 12 11 11 175 3 2 1 1
## 13 12 12 60 3 1 0 1
## 14 13 13 138 3 2 1 0
## 15 14 14 160 4 2 2 0
## 16 15 15 63 3 2 1 0
## 17 16 16 107 3 1 1 2
## 18 17 17 185 4 3 2 0
## 19 18 18 82 3 1 1 2
## 20 19 19 75 4 2 1 0
## 21 20 20 118 3 1 1 0
Here, we use the summary function to summarize the house price data from the data set. We then display the average price and the median price for each number of beds. We also display the average price and the median price for each house size.
summary(housePriceNY)
## V1 Houses Price Beds Baths
## Min. : 1 Min. : 1 Min. : 38.0 Min. :2.000 Min. :1.000
## 1st Qu.:14 1st Qu.:14 1st Qu.: 82.0 1st Qu.:3.000 1st Qu.:1.000
## Median :27 Median :27 Median :107.0 Median :3.000 Median :2.000
## Mean :27 Mean :27 Mean :113.5 Mean :3.396 Mean :1.792
## 3rd Qu.:40 3rd Qu.:40 3rd Qu.:141.0 3rd Qu.:4.000 3rd Qu.:2.000
## Max. :53 Max. :53 Max. :197.0 Max. :6.000 Max. :3.000
## Size Lot
## Min. :0.00 Min. :0.0000
## 1st Qu.:1.00 1st Qu.:0.0000
## Median :1.00 Median :0.0000
## Mean :1.17 Mean :0.4717
## 3rd Qu.:2.00 3rd Qu.:1.0000
## Max. :3.00 Max. :3.0000
# find the average price by number of bedrooms
avgPriceByBed <- aggregate(Price ~ Beds, data=housePriceNY, FUN = mean)
colnames(avgPriceByBed) <- c("Number of Beds", "Average Price")
print(avgPriceByBed)
## Number of Beds Average Price
## 1 2 82.4000
## 2 3 101.0000
## 3 4 134.8947
## 4 5 147.5000
## 5 6 120.0000
# find the median price by number of bedrooms
medPriceByBed <- aggregate(Price ~ Beds, data=housePriceNY, FUN = median)
colnames(medPriceByBed) <- c("Number of Beds", "Median Price")
print(medPriceByBed)
## Number of Beds Median Price
## 1 2 78.0
## 2 3 97.0
## 3 4 140.0
## 4 5 147.5
## 5 6 120.0
# find the average price by size
avgPriceBySize <- aggregate(Price ~ Size, data=housePriceNY, FUN = mean)
colnames(avgPriceBySize) <- c("Size", "Average Price")
print(avgPriceBySize)
## Size Average Price
## 1 0 73.42857
## 2 1 107.77419
## 3 2 141.57143
## 4 3 179.00000
# find the median price by size
medPriceBySize <- aggregate(Price ~ Size, data=housePriceNY, FUN = median)
colnames(medPriceBySize) <- c("Size", "Median Price")
print(medPriceBySize)
## Size Median Price
## 1 0 75.0
## 2 1 100.0
## 3 2 139.5
## 4 3 179.0
Here, we create a new data frame taking a subset of the columns and rows from the original data.
newHousePrice <- housePriceNY[5:20, c("Houses", "Beds", "Size", "Price")]
#show first 15 rows
head(newHousePrice, 15)
## Houses Beds Size Price
## 6 5 3 1 92
## 7 6 2 0 50
## 8 7 2 1 89
## 9 8 4 2 140
## 10 9 4 2 197
## 11 10 3 1 125
## 12 11 3 1 175
## 13 12 3 0 60
## 14 13 3 1 138
## 15 14 4 2 160
## 16 15 3 1 63
## 17 16 3 1 107
## 18 17 4 2 185
## 19 18 3 1 82
## 20 19 4 1 75
Here, we create new column names for each column in the new data frame, newHousePrice.
# create new column names for the new data frame
colnames(newHousePrice) <- c("House.Number", "Number.of.Beds", "House.Size", "House.Price")
# show first 15 rows
head(newHousePrice, 15)
## House.Number Number.of.Beds House.Size House.Price
## 6 5 3 1 92
## 7 6 2 0 50
## 8 7 2 1 89
## 9 8 4 2 140
## 10 9 4 2 197
## 11 10 3 1 125
## 12 11 3 1 175
## 13 12 3 0 60
## 14 13 3 1 138
## 15 14 4 2 160
## 16 15 3 1 63
## 17 16 3 1 107
## 18 17 4 2 185
## 19 18 3 1 82
## 20 19 4 1 75
Here, we print the mean and median for the same two attributes as before (price by number of bedrooms and price by house size).
The values have changed because this new data frame contains only a subset of the original data. The number of beds ranges from 2 to 4. With the original data table, the number of beds ranged from 2 to 6. With the original data, the house size ranged from 0 to 4, but with this new data frame, the house size ranges from 0 to 2. As a result, the median and mean price values have changed since they are aggregating over a smaller set of data.
# summarize the new data frame
summary(newHousePrice)
## House.Number Number.of.Beds House.Size House.Price
## Min. : 5.00 Min. :2.000 Min. :0.000 Min. : 50.00
## 1st Qu.: 8.75 1st Qu.:3.000 1st Qu.:1.000 1st Qu.: 80.25
## Median :12.50 Median :3.000 Median :1.000 Median :112.50
## Mean :12.50 Mean :3.188 Mean :1.125 Mean :116.00
## 3rd Qu.:16.25 3rd Qu.:4.000 3rd Qu.:1.250 3rd Qu.:145.00
## Max. :20.00 Max. :4.000 Max. :2.000 Max. :197.00
# find the new average price by number of bedrooms
newAvgPriceByBed <- aggregate(House.Price ~ Number.of.Beds, data=newHousePrice, FUN = mean)
colnames(newAvgPriceByBed) <- c("Number.of.Beds", "Average.Price")
print(newAvgPriceByBed)
## Number.of.Beds Average.Price
## 1 2 69.5000
## 2 3 106.6667
## 3 4 151.4000
# find the new median price by number of bedrooms
newMedPriceByBed <- aggregate(House.Price ~ Number.of.Beds, data=newHousePrice, FUN = median)
colnames(newMedPriceByBed) <- c("Number.of.Beds", "Median.Price")
print(newMedPriceByBed)
## Number.of.Beds Median.Price
## 1 2 69.5
## 2 3 107.0
## 3 4 160.0
# find the new average price by size
newAvgPriceBySize <- aggregate(House.Price ~ House.Size, data=newHousePrice, FUN = mean)
colnames(newAvgPriceBySize) <- c("House.Size", "Average.Price")
print(newAvgPriceBySize)
## House.Size Average.Price
## 1 0 55.0
## 2 1 106.4
## 3 2 170.5
# find the new median price by size
newMedPriceBySize <- aggregate(House.Price ~ House.Size, data=newHousePrice, FUN = median)
colnames(newMedPriceBySize) <- c("House.Size", "Median.Price")
print(newMedPriceBySize)
## House.Size Median.Price
## 1 0 55.0
## 2 1 99.5
## 3 2 172.5
Here, we change the number in the “Number.of.Beds” column to read “#” bedrooms instead of only the number. We also multiply the house price by 1000 to get the correct denomination.
# rename "2" to "2 bedrooms"
newHousePrice$Number.of.Beds <- str_replace_all(newHousePrice$Number.of.Beds, "2", "2 Bedrooms")
# rename "3" to "3 bedrooms"
newHousePrice$Number.of.Beds <- str_replace_all(newHousePrice$Number.of.Beds, "3", "3 Bedrooms")
# rename "4" to "4 bedrooms"
newHousePrice$Number.of.Beds <- str_replace_all(newHousePrice$Number.of.Beds, "4", "4 Bedrooms")
# multiply house price column by 1000
newHousePrice$House.Price <- newHousePrice$House.Price * 1000
#show updates and first 15 rows
head(newHousePrice, 15)
## House.Number Number.of.Beds House.Size House.Price
## 6 5 3 Bedrooms 1 92000
## 7 6 2 Bedrooms 0 50000
## 8 7 2 Bedrooms 1 89000
## 9 8 4 Bedrooms 2 140000
## 10 9 4 Bedrooms 2 197000
## 11 10 3 Bedrooms 1 125000
## 12 11 3 Bedrooms 1 175000
## 13 12 3 Bedrooms 0 60000
## 14 13 3 Bedrooms 1 138000
## 15 14 4 Bedrooms 2 160000
## 16 15 3 Bedrooms 1 63000
## 17 16 3 Bedrooms 1 107000
## 18 17 4 Bedrooms 2 185000
## 19 18 3 Bedrooms 1 82000
## 20 19 4 Bedrooms 1 75000