#Loading and preparing the data

library(foreign)

# Read the mnmappluto.dbf attribute table into the data frame MN.
# NOTE(review): the path is machine-specific; adjust it before running on
# another computer.
MN <- read.dbf(
  file = "C:/Users/QINGHUAN/Desktop/mnmappluto.dbf",
  as.is = FALSE
)

#Map all of the buildings in Manhattan

# Scatter plot with one point per row of MN, positioned by its X/Y coordinates.
plot(MN$YCoord ~ MN$XCoord)

# [figure: plot of chunk unnamed-chunk-2]

# Inspect the distribution (min, quartiles, mean, max) of the building
# Y and X coordinates.

summary(MN[c("YCoord", "XCoord")])
##      YCoord           XCoord       
##  Min.   :     0   Min.   :      0  
##  1st Qu.:207645   1st Qu.: 986617  
##  Median :219274   Median : 991591  
##  Mean   :217078   Mean   : 977235  
##  3rd Qu.:231006   3rd Qu.: 998354  
##  Max.   :259301   Max.   :1009761
# Keep only the rows whose X and Y coordinates are both greater than 0; this
# drops the records with a 0 coordinate seen in the summary above.
MN <- MN[with(MN, YCoord > 0 & XCoord > 0), ]
# Report the number of rows and columns that remain.
dim(MN)
## [1] 43318    78

#Draw the map

# Redraw the scatter plot now that rows with a 0 coordinate have been removed.
plot(MN$YCoord ~ MN$XCoord)

# [figure: plot of chunk unnamed-chunk-4]

# Row indices of the buildings whose owner name contains "island"
# (case-insensitive match).

islandBldgs <- which(grepl("island", MN$OwnerName, ignore.case = TRUE))

# Flag every row of MN by historic-district membership: HD is 1 when HistDist
# is recorded (non-missing) and 0 when HistDist is NA.

MN$HD <- as.numeric(!is.na(MN$HistDist))
summary(MN$HD)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   0.000   0.000   0.000   0.215   0.000   1.000

# Recode the 0/1 indicator as a factor so that summary() reports a count per
# level instead of numeric quantiles.

MN$HD <- factor(MN$HD)

summary(MN$HD)
##     0     1 
## 34024  9294

#Draw a crude map of historic districts.

# One small filled point per building, colored by the HD factor so that
# buildings inside and outside historic districts are visually distinct.
plot(y = MN$YCoord, x = MN$XCoord, col = MN$HD, pch = 16, cex = 0.5)

# [figure: plot of chunk unnamed-chunk-8]

# Buildings inside a historic district (HD level "1"); dim() reports the
# number of rows and columns of the subset.

inHD <- MN[MN$HD == "1", ]
dim(inHD)
## [1] 9294   79

# Buildings outside a historic district (HD level "0"); dim() reports the
# number of rows and columns of the subset.

outHD <- MN[MN$HD == "0", ]
dim(outHD)
## [1] 34024    79

# Hypothesis test 1: two-sided Welch two-sample t-test.
# Null hypothesis: the designation of historic districts has no effect on
# property values; i.e. the buildings in a historic district have the same
# mean value as those outside of a historic district, and any difference
# between the two groups is due to random chance.

# x = total assessed value inside historic districts, y = outside.
t.test(x = inHD$AssessTot, y = outHD$AssessTot)
## 
##  Welch Two Sample t-test
## 
## data:  inHD$AssessTot and outHD$AssessTot 
## t = -15.05, df = 43286, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0 
## 95 percent confidence interval:
##  -1724546 -1327117 
## sample estimates:
## mean of x mean of y 
##   1233743   2759574

#Question1: What does hypothesis test 1 tell you?

#Test 1 tells us to reject the null hypothesis: the mean assessed values of buildings inside and outside of historic districts are significantly different. This can be seen from the large t-statistic (t = -15.05) and the small p-value (< 2.2e-16, far below 0.05). The 95 percent confidence interval for the difference in means doesn't include 0, which supports the same conclusion. The lower mean of x than of y indicates that the buildings in a historic district have, on average, lower property values than the buildings outside of a historic district.

# Hypothesis test 2: two-sided Welch two-sample t-test on building area.
# Null hypothesis: there is no significant difference between the mean area
# of buildings inside and outside of historic districts.

# x = building area inside historic districts, y = outside.
t.test(x = inHD$BldgArea, y = outHD$BldgArea)
## 
##  Welch Two Sample t-test
## 
## data:  inHD$BldgArea and outHD$BldgArea 
## t = -9.037, df = 15819, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0 
## 95 percent confidence interval:
##  -25103 -16154 
## sample estimates:
## mean of x mean of y 
##     22050     42678

#Question2: What does hypothesis test 2 tell you about the size of the buildings inside and outside of historic districts?

#Test 2 tells us that the mean sizes of buildings inside and outside of historic districts are significantly different, as shown by the large t-statistic (t = -9.037) and the low p-value (< 2.2e-16). The 95% confidence interval doesn't include 0. The mean size of buildings in historic districts is 22050.04 square feet, while the mean size of buildings outside of historic districts is 42678.31 square feet, so buildings in historic districts are smaller on average.

# Block identifiers of every building inside a historic district. Duplicates
# are kept; only membership matters for the lookup below.
blocks <- inHD$Block
# All buildings in MN located on a block that contains at least one
# historic-district building, i.e. rows whose Block value appears in
# `blocks`.
HDB <- MN[is.element(MN$Block, blocks), ]
# Buildings on those shared blocks that are outside a historic district.
HDB_out <- HDB[HDB$HD == "0", ]
# Buildings on those shared blocks that are inside a historic district.
HDB_in <- HDB[HDB$HD == "1", ]

# Hypothesis test 3: two-sided Welch two-sample t-test on total assessed
# value, restricted to blocks that contain historic-district buildings
# (a control for location).

# x = historic buildings on those blocks, y = their non-historic neighbors.
t.test(x = HDB_in$AssessTot, y = HDB_out$AssessTot)
## 
##  Welch Two Sample t-test
## 
## data:  HDB_in$AssessTot and HDB_out$AssessTot 
## t = -9.728, df = 4349, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0 
## 95 percent confidence interval:
##  -1507426 -1001727 
## sample estimates:
## mean of x mean of y 
##   1233743   2488319

#Question3: After controlling for location is the historic district designation associated with a difference in property values? Are the buildings in the historic district different from their non-historic neighbors? Use the p-value from hypothesis test 3 to support your conclusions.

#The null hypothesis for test 3 is that the buildings in the historic district have the same mean property value as their non-historic neighbors. We reject this null hypothesis because of the large t-statistic (t = -9.728) and the low p-value (< 2.2e-16). The 95% confidence interval doesn't include 0, which supports the conclusion. So even after controlling for location, the historic district designation is associated with a difference in property values: the mean property value of buildings in the historic district is about 1.23 million, while the mean property value of their non-historic neighbors is about 2.49 million.

# Assessed value per square foot for the historic buildings, restricted to
# buildings with an area greater than 0 (avoids dividing by zero).
HDB_in_sqft <- with(HDB_in[HDB_in$BldgArea > 0, ], AssessTot / BldgArea)
# The same ratio for the non-historic buildings, again only where the area
# is greater than 0.
HDB_out_sqft <- with(HDB_out[HDB_out$BldgArea > 0, ], AssessTot / BldgArea)

# Hypothesis test 4: two-sided Welch two-sample t-test on assessed value per
# square foot.
# Null hypothesis: historic and non-historic buildings do not have
# significantly different values after controlling for location and
# building size.

# x = value per square foot of historic buildings, y = of their neighbors.
t.test(HDB_in_sqft, HDB_out_sqft)
## 
##  Welch Two Sample t-test
## 
## data:  HDB_in_sqft and HDB_out_sqft 
## t = -1.664, df = 4521, p-value = 0.09614
## alternative hypothesis: true difference in means is not equal to 0 
## 95 percent confidence interval:
##  -36.413   2.976 
## sample estimates:
## mean of x mean of y 
##     66.76     83.48

#Question4:After controlling for location and building size, do historic and non-historic buildings have significantly different values? If your conclusion has changed between the 1st and 4th hypothesis tests explain why.

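#We fail to reject the null hypothesis because of the small t-statistic (t = -1.664) and the p-value above 0.05 (p = 0.096). The 95% confidence interval includes 0, which supports the same conclusion. The result of test 4 is different from tests 1 to 3 because only test 4 controls for both location and building size. This suggests that building size, rather than historic designation, explains much of the difference found in the earlier tests (test 2 showed that historic-district buildings are smaller on average), although the influence of building size on property value has not been tested directly.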

# Correlation
# Pearson correlation between north-south position (YCoord) and total
# assessed value (AssessTot).
cor(x = MN$YCoord, y = MN$AssessTot, method = "pearson")
## [1] -0.05964

# Significance test for the same Pearson correlation.
cor.test(x = MN$YCoord, y = MN$AssessTot, method = "pearson")
## 
##  Pearson's product-moment correlation
## 
## data:  MN$YCoord and MN$AssessTot 
## t = -12.43, df = 43316, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0 
## 95 percent confidence interval:
##  -0.06902 -0.05025 
## sample estimates:
##      cor 
## -0.05964

#Question5: Factors other than historic designation affect the price of real estate in MN. Is there a significant correlation between a buildings north-south position on the island(YCoord) and its total assessed value(AssessTot)? Are downtown buildings worth more than uptown buildings? Use cor.test() to answer the question.

#The null hypothesis of test 5 is that there is no correlation between a building's north-south position on the island (YCoord) and its total assessed value (AssessTot). The hypothesis is rejected because of the large t-statistic (t = -12.43) and the low p-value (< 2.2e-16). So there is a statistically significant correlation between a building's north-south position on the island and its total assessed value. The negative correlation between them (about -0.06) indicates that the total assessed value tends to decrease when moving north along the island. So downtown buildings are, on average, worth more than uptown buildings, although the correlation is weak.

#Question6: Use the layout() function to make a chart showing buildings on the island of Manhattan in one panel and a scatter plot of the correlation between YCoord and AssessTot in the other panel. If you need help type ? layout.

# Two side-by-side panels: the building map on the left and the relationship
# between total assessed value and north-south position on the right.
layout(matrix(c(1, 2), nrow = 1, ncol = 2, byrow = TRUE))
plot(
  y = MN$YCoord, x = MN$XCoord,
  main = "Buildings in Manhattan",
  ylab = "Y Coordinate", xlab = "X Coordinate"
)
# Points are colored by the HD factor, matching the historic-district map.
plot(
  y = MN$YCoord, x = MN$AssessTot,
  col = MN$HD, pch = 16, cex = 0.5,
  main = "Building Value vs. Y Coordinate(N-S Position)",
  xlab = "Total Assessed Value", ylab = "Y Coordinate"
)
# Restore the single-panel layout so that later plots are not squeezed into
# the two-panel grid.
layout(1)

# [figure: plot of chunk unnamed-chunk-18]