library("foreign")
# Read the Manhattan building table from its dBase (.dbf) file.
MN <- read.dbf(file = "/Users/Claudio Alvarez/Documents/mnmappluto.dbf")
# Scatter every building at its X/Y coordinate to get a rough map.
plot(MN$YCoord ~ MN$XCoord)
# Summarise both coordinate columns; the output below shows a minimum of 0.
summary(MN[c("YCoord", "XCoord")])
## YCoord XCoord
## Min. : 0 Min. : 0
## 1st Qu.:207645 1st Qu.: 986617
## Median :219274 Median : 991591
## Mean :217078 Mean : 977235
## 3rd Qu.:231006 3rd Qu.: 998354
## Max. :259301 Max. :1009761
# Keep only the rows that have a usable location.
#   MN$XCoord > 0   is TRUE for rows whose X coordinate is above zero
#   MN$YCoord > 0   is TRUE for rows whose Y coordinate is above zero
# `&` combines the two tests element by element, so a row is kept only when
# both parts are TRUE.
MN <- MN[MN$XCoord > 0 & MN$YCoord > 0, ]
# Rows and columns remaining after the filter.
dim(MN)
## [1] 43318 78
# Redraw the map without the rows that had a zero coordinate.
plot(MN$YCoord ~ MN$XCoord)
## Identifying historic districts
## Dummy variable: HD is 0 when HistDist is missing (NA) and 1 otherwise.
MN$HD <- ifelse(is.na(MN[, "HistDist"]), 0, 1)
# Convert the 0/1 flag to a factor so it can act as a grouping variable and
# as the colour index in the plot below.
MN$HD <- as.factor(MN$HD)
# Map the buildings again, coloured by the historic-district flag.
plot(
  x = MN$XCoord, y = MN$YCoord, col = MN$HD, pch = 16, cex = 0.5,
  xlab = "X Coordinate", ylab = "Y Coordinate",
  main = "Manhattan Building Locations"
)
###### Subsets of the data
# Buildings inside an historic district: rows whose HD flag is 1, picked out
# with a logical test inside bracket notation.
inHD <- MN[MN$HD == "1", ]
# The same selection for buildings NOT in an historic district (HD is 0).
outHD <- MN[MN$HD == "0", ]
## Hypothesis testing
# t-test 1. Null hypothesis: the buildings in an historic district have the
# same total assessed value as those outside of an historic district, and
# any difference between the two groups is due to random chance.
t.test(x = inHD$AssessTot, y = outHD$AssessTot) #Hypothesis Test 1
##
## Welch Two Sample t-test
##
## data: inHD$AssessTot and outHD$AssessTot
## t = -15.05, df = 43286, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -1724546 -1327117
## sample estimates:
## mean of x mean of y
## 1233743 2759574
### Question 1. What does hypothesis test 1 tell you?
# t-test 2. Null hypothesis: the buildings in an historic district have the
# same building area as those outside of an historic district.
t.test(inHD$BldgArea, outHD$BldgArea) # Hypothesis Test 2
##
## Welch Two Sample t-test
##
## data: inHD$BldgArea and outHD$BldgArea
## t = -9.037, df = 15819, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -25103 -16154
## sample estimates:
## mean of x mean of y
## 22050 42678
## Question 2. What does hypothesis test 2 tell you about the size of the
## buildings inside and outside of historic districts?
## Select buildings on the same block as a historic district.
## First collect the block number of every building that is in an
## historic district.
blocks <- inHD[["Block"]]
head(blocks, n = 6L)
## [1] 1 1 7 7 7 7
## Keep every building in MN whose block number appears in `blocks`, i.e.
## every building that shares a block with an historic building, and save
## the result as a new object.
HDB <- MN[is.element(MN$Block, blocks), ]
## What does the object HDB_out contain?
## (The rows of HDB whose HD flag is 0.)
HDB_out <- HDB[HDB$HD == "0", ]
## Buildings on those same blocks that are themselves in an historic
## district (HD flag is 1).
HDB_in <- HDB[HDB$HD == "1", ]
## Hypothesis 3: compare total assessed value of historic and non-historic
## buildings that sit on the same blocks.
t.test(HDB_in$AssessTot, HDB_out$AssessTot) # Hypothesis Test 3
##
## Welch Two Sample t-test
##
## data: HDB_in$AssessTot and HDB_out$AssessTot
## t = -9.728, df = 4349, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -1507426 -1001727
## sample estimates:
## mean of x mean of y
## 1233743 2488319
## Question 3. After controlling for location is the historic district
## designation associated with a difference in property values? Are the
## buildings in the historic district different from their non-historic
## neighbors? Use the p-value from hypothesis test 3 to support your
## conclusions.
# We have a problem: some buildings have 0 area (square footage).
summary(HDB_in$BldgArea)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0 4160 6370 22100 13000 17600000
# A zero area could mean the lot is vacant, or it could be an error. Either
# way it makes price per square foot impossible to compute, so zero-area
# buildings are excluded from the t-test.
# Price per square foot for historic buildings, restricted to rows with an
# area greater than 0.
HDB_in_sqft <- with(HDB_in[HDB_in$BldgArea > 0, ], AssessTot / BldgArea)
# Price per square foot for non-historic buildings, same restriction.
HDB_out_sqft <- with(HDB_out[HDB_out$BldgArea > 0, ], AssessTot / BldgArea)
# Hypothesis test 4: a t-test built from 'HDB_in_sqft' and 'HDB_out_sqft'
# with the t.test() function. A correctly constructed test produces output
# like the lines below.
t.test(x = HDB_in_sqft, y = HDB_out_sqft)
##
## Welch Two Sample t-test
##
## data: HDB_in_sqft and HDB_out_sqft
## t = -1.664, df = 4521, p-value = 0.09614
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -36.413 2.976
## sample estimates:
## mean of x mean of y
## 66.76 83.48
## Question 4. After controlling for location and building size, do
## historic and non-historic buildings have significantly different
## values? If your conclusion has changed between the 1st and 4th
## hypothesis tests, explain why.
## Correlation: using the data x and y generated above.
# Correlation coefficients on a small example. `x` and `y` are not created
# anywhere earlier in this script, so every call here used to stop with
# "object 'y' not found". They are generated here so the section runs.
# NOTE(review): the data originally intended for x and y is not shown in
# this file; these synthetic values are a stand-in -- confirm or replace.
set.seed(1) # fixed seed so the coefficients are reproducible
x <- rnorm(100)
y <- x + rnorm(100)
cor(x, y)
cor(x, y, method = "pearson") #Pearson is the default.
cor(x, y, method = "kendall")
cor(x, y, method = "spearman")
# Pearson correlation test (the default method) between total assessed
# value and the Y coordinate.
cor.test(x = MN$AssessTot, y = MN$YCoord)
##
## Pearson's product-moment correlation
##
## data: MN$AssessTot and MN$YCoord
## t = -12.43, df = 43316, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## -0.06902 -0.05025
## sample estimates:
## cor
## -0.05964
# The same pair of variables, tested with Kendall's rank correlation.
cor.test(x = MN$AssessTot, y = MN$YCoord, method = "kendall")
##
## Kendall's rank correlation tau
##
## data: MN$AssessTot and MN$YCoord
## z = -60.93, p-value < 2.2e-16
## alternative hypothesis: true tau is not equal to 0
## sample estimates:
## tau
## -0.1952
# The same pair of variables, tested with Spearman's rank correlation.
cor.test(x = MN$AssessTot, y = MN$YCoord, method = "spearman")
## Warning: Cannot compute exact p-values with ties
##
## Spearman's rank correlation rho
##
## data: MN$AssessTot and MN$YCoord
## S = 1.754e+13, p-value < 2.2e-16
## alternative hypothesis: true rho is not equal to 0
## sample estimates:
## rho
## -0.2947
## Question 5. Factors other than historic designation affect the price
## of real estate in MN. Is there a significant correlation between a
## building's north-south position on the island (YCoord) and its total
## assessed value (AssessTot)? Are downtown buildings worth more than
## uptown buildings? Use cor.test() to answer the question.
# Distribution of the NumBldgs column.
summary(MN$NumBldgs)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.00 1.00 1.00 1.08 1.00 163.00
# Print the first 100 values of the column.
MN$NumBldgs[seq_len(100)]
## [1] 163 12 1 1 1 1 2 1 1 1 1 1 1 1 1 1 1
## [18] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [35] 1 1 1 1 6 1 2 1 1 1 1 1 1 1 0 1 1
## [52] 1 1 1 2 1 2 1 1 2 1 1 1 1 1 1 1 1
## [69] 1 1 1 1 1 1 1 1 1 1 2 1 1 1 1 1 0
## [86] 1 1 1 1 1 1 1 1 1 2 1 1 1 1 1
# Kendall rank correlation between NumBldgs and total assessed value.
cor.test(x = MN$NumBldgs, y = MN$AssessTot, method = "kendall")
##
## Kendall's rank correlation tau
##
## data: MN$NumBldgs and MN$AssessTot
## z = 52.41, p-value < 2.2e-16
## alternative hypothesis: true tau is not equal to 0
## sample estimates:
## tau
## 0.202
# Spearman rank correlation between total assessed value and the Y
# coordinate, used for the Question 5 answer below.
cor.test(x = MN$AssessTot, y = MN$YCoord, method = "spearman")
## Warning: Cannot compute exact p-values with ties
##
## Spearman's rank correlation rho
##
## data: MN$AssessTot and MN$YCoord
## S = 1.754e+13, p-value < 2.2e-16
## alternative hypothesis: true rho is not equal to 0
## sample estimates:
## rho
## -0.2947
## Null hypothesis: there is no correlation between a building's
## north-south position (YCoord) and its total assessed value (AssessTot).
## This hypothesis is rejected because the test statistic is large in
## magnitude and the p-value is < 0.01. There is a negative correlation
## between AssessTot and YCoord; therefore buildings located farther north
## tend to have a lower total assessed value.
## Question 6: Use the layout() function to make a chart showing buildings
## on the island of Manhattan in one panel and a scatter plot of the
## correlation between YCoord and AssessTot in the other panel.
# Split the plotting device into one row of two panels; the values 1 and 2
# give the order in which the panels are filled.
matrx <- matrix(c(1, 2), nrow = 1, ncol = 2)
layout(mat = matrx)
# Panel 1: building locations.
plot(
  MN$YCoord ~ MN$XCoord,
  main = "Manhattan Building Locations",
  xlab = "X Coordinate",
  ylab = "Y Coordinate"
)
# Panel 2: assessed value against Y coordinate, with the x axis limited to
# the range 0 to 5e+08.
plot(
  x = MN$AssessTot, y = MN$YCoord,
  xlim = c(0, 5e+08),
  main = "Assessment Value by Y Coord",
  xlab = "Property Assessment Value ($)",
  ylab = "Y Coordinate (Larger = Further North)"
)