# Homework 2: Hypothesis Testing Done by: Andy Staffer
# install.packages('foreign') #Only need to do this once per computer.
library("foreign")
# Read the mnmappluto.dbf attribute table into a data frame.
dbf_path <- "G:/Quant/Assignments/Homework2/mnmappluto.dbf"
MN <- read.dbf(dbf_path)
# First look at the coordinates. The raw picture is misleading because some
# records sit at (0, 0).
plot(x = MN$XCoord, y = MN$YCoord)
# Inspect the coordinate columns before cleaning. The first subscript (the
# row index) is left blank on purpose: an empty index means "every row".
coord_cols <- c("YCoord", "XCoord")
summary(MN[, coord_cols])
## YCoord XCoord
## Min. : 0 Min. : 0
## 1st Qu.:207645 1st Qu.: 986617
## Median :219274 Median : 991591
## Mean :217078 Mean : 977235
## 3rd Qu.:231006 3rd Qu.: 998354
## Max. :259301 Max. :1009761
# Keep only the records whose X and Y coordinates are both positive; this
# removes the points piled up at the origin.
has_coords <- MN$XCoord > 0 & MN$YCoord > 0
MN_fix <- MN[has_coords, ]
# Rows and columns before the clean-up
dim(MN)
## [1] 43992 78
# Rows and columns after the clean-up
dim(MN_fix)
## [1] 43318 78
# Redraw the map from the cleaned table
plot(x = MN_fix$XCoord, y = MN_fix$YCoord)
# For fun: case-insensitive search of the owner names for "trump".
# grepl() flags each matching row and which() turns the flags into row
# positions.
trumpBldgs <- which(grepl("trump", MN_fix$OwnerName, ignore.case = TRUE))
trump_cols <- c("Address", "AssessTot", "OwnerName")
MN_fix[trumpBldgs, trump_cols]
## Address AssessTot OwnerName
## 13619 200 EAST 69 STREET 53710629 TRUMP PALACE COMPANY
## 39891 1030 3 AVENUE 23940000 TRUMP PLAZA OWNERS IN
# Identify historical districts: tally the lots recorded under each HistDist
# value (lots without a district are reported under NA's).
summary(MN_fix[["HistDist"]])
## African Burial Ground & The Commons
## 9
## Audubon Terrace
## 7
## Carnegie Hill
## 94
## Charlton-King-Vandam
## 72
## Chelsea
## 142
## Chelsea Historic District Extension
## 108
## East 17th Street/Irving Place
## 10
## Ellis Island
## 1
## Expanded Carnegie Hill
## 267
## Fraunces Tavern Block
## 11
## Gansevoort Market
## 77
## Governors Island
## 1
## Gramercy Park
## 64
## Gramercy Park Extension
## 1
## Greenwich Village
## 1864
## Greenwich Village Extension
## 37
## Hamilton Heights
## 203
## Hamilton Heights Extension
## 51
## Hamilton Heights/Sugar Hill
## 186
## Hamilton Heights/Sugar Hill District Ext
## 15
## Hamilton Heights/Sugar Hill Northeast
## 32
## Hamilton Heights/Sugar Hill Northwest
## 108
## Hardenbergh / Rhinelander
## 7
## Henderson Place
## 21
## Jumel Terrace
## 56
## Ladies' Mile
## 348
## MacDougal-Sullivan Gardens
## 21
## Madison Square North
## 93
## Metropolitan Museum
## 131
## Mount Morris Park
## 257
## Murray Hill
## 76
## Murray Hill Historic District Extension
## 12
## NoHo
## 102
## Noho East
## 39
## Riverside-West End
## 260
## Riverside - West 105th Street
## 30
## Riverside Drive-West 80th-81 Street
## 36
## Sniffen Court
## 9
## SoHo-Cast Iron
## 445
## South Street Seaport
## 70
## South Street Seaport Extension
## 12
## St. Mark's
## 31
## St. Mark's Extension
## 2
## St. Nicholas
## 146
## Stone Street
## 14
## Stuyvesant Square
## 55
## Treadwell Farm
## 76
## Tribeca East
## 188
## Tribeca North
## 62
## Tribeca South
## 62
## Tribeca South Extension
## 23
## Tribeca West
## 179
## Tudor City
## 21
## Turtle Bay Gardens
## 19
## Upper East Side
## 980
## Upper West Side/Central Park West
## 1864
## Weehawken Street
## 9
## West 71st Street
## 34
## West End - Collegiate
## 144
## NA's
## 34024
# Massage the HistDist field using is.na() and ifelse():
# HD is 1 when a lot has a historic-district name and 0 when HistDist is NA.
# The column is tested directly. The earlier row index 1:dim(MN_fix) was
# unnecessary and raised a warning, because dim() returns two numbers (rows
# and columns) and `:` only uses the first.
MN_fix$HD <- ifelse(is.na(MN_fix$HistDist), 0, 1)
# summary() of a numeric 0/1 vector reports quantiles; the mean (0.215) is
# the share of lots coded 1. Use table(MN_fix$HD) to count the 0's and 1's.
summary(MN_fix$HD)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.000 0.000 0.000 0.215 0.000 1.000
# Once HD is a factor, summary() reports the count in each level instead.
MN_fix$HD <- as.factor(MN_fix$HD)
summary(MN_fix$HD)
## 0 1
## 34024 9294
# Create a categorical map: one small filled dot per lot, coloured by HD.
# HD is a factor, so its two levels (0, 1) select the first two palette
# colours. The plotting-symbol argument is `pch`; it was previously
# misspelled `psh`, which R ignored with a warning, so the requested symbol
# never took effect.
plot(y = MN_fix$YCoord, x = MN_fix$XCoord, col = MN_fix$HD, pch = 16, cex = 0.5)
# Split the cleaned lots by the HD flag: inHD holds the lots inside a
# historic district (HD == 1), outHD holds the remaining lots (HD == 0).
is_historic <- MN_fix$HD == 1
inHD <- MN_fix[is_historic, ]
outHD <- MN_fix[!is_historic, ]
######################################################## Hypothesis Testing
# H_o: historic districts have no effect on property values,
# i.e. inHD_propertyValues = outHD_propertyValues
# (t.test defaults spelled out: two-sided Welch test, 95% confidence interval)
t.test(inHD$AssessTot, outHD$AssessTot, alternative = "two.sided",
    var.equal = FALSE, conf.level = 0.95)
##
## Welch Two Sample t-test
##
## data: inHD$AssessTot and outHD$AssessTot
## t = -15.05, df = 43286, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -1724546 -1327117
## sample estimates:
## mean of x mean of y
## 1233743 2759574
######## QUESTION 1: What does this hypothesis test tell you?
######## ANSWER: We can reject the null hypothesis at the 5% significance
######## level (p-value < 2.2e-16): there is a significant difference in mean
######## assessed value between the two groups of property values.
# H_o: inHD_size = outHD_size (mean building area is equal in both groups)
t.test(inHD$BldgArea, outHD$BldgArea, alternative = "two.sided",
    var.equal = FALSE, conf.level = 0.95)
##
## Welch Two Sample t-test
##
## data: inHD$BldgArea and outHD$BldgArea
## t = -9.037, df = 15819, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -25103 -16154
## sample estimates:
## mean of x mean of y
## 22050 42678
######## QUESTION 2: What does hypothesis test 2 tell you about the size of
######## buildings inside and outside of historic districts?
######## ANSWER: We can reject the null hypothesis at the 5% significance
######## level (p-value < 2.2e-16): there is a significant difference in
######## mean building area/size between the two groups.
# Refine the comparison group using a simple spatial relationship.
# `blocks` is the Block value of every lot inside a historic district
# (one entry per lot, so values repeat).
blocks <- inHD$Block
# head() prints the first six entries; they are values of the Block column
# (presumably tax-block numbers -- confirm against the data dictionary).
head(blocks)
## [1] 1 1 7 7 7 7
# is.element(x, table) is the function form of x %in% table: it is TRUE for
# each lot whose Block appears in `blocks`. HDB is every lot on such a block.
on_hd_block <- is.element(MN_fix$Block, blocks)
HDB <- MN_fix[on_hd_block, ]
# HDB_in: the lots inside a historic district.
# HDB_out: only the non-historic lots on the same block as a historic lot.
HDB_in <- HDB[HDB$HD == 1, ]
HDB_out <- HDB[HDB$HD == 0, ]
# H_o: HDB_in property values = HDB_out property values (same-block lots only)
t.test(HDB_in$AssessTot, HDB_out$AssessTot, alternative = "two.sided",
    var.equal = FALSE, conf.level = 0.95)
##
## Welch Two Sample t-test
##
## data: HDB_in$AssessTot and HDB_out$AssessTot
## t = -9.728, df = 4349, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -1507426 -1001727
## sample estimates:
## mean of x mean of y
## 1233743 2488319
######## QUESTION 3: After controlling for location, is the historic
######## district designation associated with a difference in property
######## values? Are the buildings in the historic district different from
######## their non-historic neighbors? Use the p-value from hypothesis
######## test 3 to support your conclusions.
######## ANSWER: After controlling for location, there is still a
######## significant difference in property values between buildings that
######## are in versus out of the historic district boundaries. Since the
######## p-value (< 2.2e-16) is less than 0.05, we can reject the null
######## hypothesis at the 5% significance level.
# Control for building size as well as location by comparing assessed value
# per square foot of building area.
summary(HDB_in$BldgArea) #min area is 0.
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0 4160 6370 22100 13000 17600000
# Lots with zero building area are dropped first so the ratio is defined.
# Value per square foot for the historic lots:
HDB_in_sqft <- with(HDB_in[HDB_in$BldgArea > 0, ], AssessTot/BldgArea)
# Value per square foot for the non-historic lots on the same blocks:
HDB_out_sqft <- with(HDB_out[HDB_out$BldgArea > 0, ], AssessTot/BldgArea)
# H_o: HDB_in_sqft property value = HDB_out_sqft property value
t.test(HDB_in_sqft, HDB_out_sqft, alternative = "two.sided", var.equal = FALSE,
    conf.level = 0.95)
##
## Welch Two Sample t-test
##
## data: HDB_in_sqft and HDB_out_sqft
## t = -1.664, df = 4521, p-value = 0.09614
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -36.413 2.976
## sample estimates:
## mean of x mean of y
## 66.76 83.48
######## QUESTION 4: After controlling for location and building size, do
######## historic and non-historic buildings have significantly different
######## values? If your conclusion has changed between the first and
######## fourth hypothesis tests, explain why.
######## ANSWER: At the 5% significance level we fail to reject the null
######## hypothesis (p-value = 0.096). There is no statistically
######## significant difference between the per-square-foot values of
######## historical and non-historical buildings. This conclusion differs
######## from the first hypothesis test, which just compared building
######## values between all historical and all non-historical buildings,
######## for two reasons. First, we refined our search radius of
######## non-historical buildings to include only those on the same block
######## as a historical building, i.e. their direct neighbors (Tobler's
######## First Law). Second, we normalized the property values by building
######## area. This removes the bias of very large buildings being more
######## expensive simply because they are larger.
######################################################## Correlation
# QUESTION 5: Factors other than historic designation affect the price of
# real estate in MN. Is there a significant correlation between a building's
# N-S position on the island and its total assessed value? Are downtown
# buildings worth more than uptown buildings?
# H_o: rho = 0 between N-S location (YCoord) and property value (AssessTot)
cor.test(x = MN_fix$YCoord, y = MN_fix$AssessTot, method = "pearson")
##
## Pearson's product-moment correlation
##
## data: MN_fix$YCoord and MN_fix$AssessTot
## t = -12.43, df = 43316, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## -0.06902 -0.05025
## sample estimates:
## cor
## -0.05964
# At the 5% significance level we can reject the null hypothesis. There is a
# negative, though very weak (r = -0.05964), correlation between the two
# variables.
plot(x = MN_fix$AssessTot, y = MN_fix$YCoord)
# Normalize the property values by building area.
summary(MN_fix$BldgArea) # some lots have an area of 0; they are dropped below
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0 4100 8360 38300 20200 22400000
has_area <- MN_fix$BldgArea > 0
MN_bigBldg <- MN_fix[has_area, ]
MN_bigBldg$normVal <- with(MN_bigBldg, AssessTot/BldgArea)
# H_o: rho = 0 between N-S location and property value per square foot.
cor.test(x = MN_bigBldg$YCoord, y = MN_bigBldg$normVal, method = "pearson")
##
## Pearson's product-moment correlation
##
## data: MN_bigBldg$YCoord and MN_bigBldg$normVal
## t = 0.2044, df = 40826, p-value = 0.8381
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## -0.008689 0.010711
## sample estimates:
## cor
## 0.001011
plot(x = MN_bigBldg$normVal, y = MN_bigBldg$YCoord)
# Answer: At the 5% significance level we fail to reject the null hypothesis
# (p-value = 0.8381). There is no significant correlation between N-S
# position and property value per square foot.
######## QUESTION 6: Use the layout() function to make a chart showing
######## buildings on the island of Manhattan in one panel and a scatter
######## plot of the correlation between Ycoord and AssessTot in the other
######## panel.
######## Answer:
# One row, two columns: panel 1 is drawn on the left, panel 2 on the right.
panel_grid <- matrix(c(1, 2), nrow = 1, ncol = 2)
myLayout <- layout(mat = panel_grid, respect = TRUE)
# Panel 1: map of the cleaned building coordinates.
plot(x = MN_fix$XCoord, y = MN_fix$YCoord, xlab = "xCoords", ylab = "yCoords",
    main = "Manhattan Buildings")
# Panel 2: N-S position against total assessed value.
plot(x = MN_fix$AssessTot, y = MN_fix$YCoord, xlab = "Property Value",
    ylab = "YCoord(N-S directionality)",
    main = "Correlation between N-S position and Property Value")