Homework 2

library("foreign")

# Location of the Manhattan attribute table (mnmappluto.dbf). The path is
# specific to one machine; keeping it in a single named value means only this
# line has to change when the script runs elsewhere.
pluto_path <- "/Users/Claudio Alvarez/Documents/mnmappluto.dbf"

# Fail early with a readable message instead of a low-level DBF reader error
# when the file is not where the path above expects it.
if (!file.exists(pluto_path)) {
  stop("Input file not found: ", pluto_path, call. = FALSE)
}

MN <- read.dbf(pluto_path)


## Scatter every record's X/Y coordinates to draw a rough map of the
## buildings in Manhattan. The axis labels are the argument expressions.
plot(x = MN$XCoord, y = MN$YCoord)

plot of chunk unnamed-chunk-1


# Five-number summary (plus mean) of the two coordinate columns.
summary(MN[c("YCoord", "XCoord")])
##      YCoord           XCoord       
##  Min.   :     0   Min.   :      0  
##  1st Qu.:207645   1st Qu.: 986617  
##  Median :219274   Median : 991591  
##  Mean   :217078   Mean   : 977235  
##  3rd Qu.:231006   3rd Qu.: 998354  
##  Max.   :259301   Max.   :1009761

# Check each row to see if its Y coordinates are greater than zero
# MN$YCoord > 0

# Check each row to see if its X coordinates are greater than zero
# MN$XCoord > 0

# Combine the above to identify rows that have both X and Y coordinates
# above zero. The line below is a double logical expression; both parts
# have to be true for R to return 'TRUE':
# MN$YCoord > 0 & MN$XCoord > 0

# Keep only the rows whose X and Y coordinates are both greater than zero.
# which() converts the logical test into row positions, so a row with a
# missing coordinate is dropped; indexing a data frame directly with a
# logical NA would instead return an all-NA row for it.
MN <- MN[which(MN$YCoord > 0 & MN$XCoord > 0), ]

dim(MN)
## [1] 43318    78

# Redraw the map now that the zero-coordinate rows are gone.
plot(MN$YCoord ~ MN$XCoord)

plot of chunk unnamed-chunk-1

Identifying Historic Districts

## dummy variables

# HD is a factor built from a 0/1 indicator: 1 when HistDist is recorded for
# the row, 0 when HistDist is missing (NA).
MN$HD <- factor(as.integer(!is.na(MN[["HistDist"]])))

# Map of building locations, coloured by the HD factor so the two groups are
# drawn in different colours.
plot(
  x = MN$XCoord, y = MN$YCoord,
  main = "Manhattan Building Locations",
  xlab = "X Coordinate", ylab = "Y Coordinate",
  col = MN$HD, pch = 16, cex = 0.5
)

plot of chunk unnamed-chunk-2



###### subset of the data

# Buildings that are in an historic district (HD == 1).
inHD <- subset(MN, HD == 1)
# Buildings that are NOT in an historic district (HD == 0).
outHD <- subset(MN, HD == 0)

Hypothesis testing

# Hypothesis test 1: two-sided Welch t-test on total assessed value.
# Null hypothesis: buildings in an historic district have the same value
# (AssessTot) as those outside of one, and any difference between the two
# groups is due to random chance.
t.test(inHD$AssessTot, outHD$AssessTot,
       alternative = "two.sided", var.equal = FALSE)
## 
##  Welch Two Sample t-test
## 
## data:  inHD$AssessTot and outHD$AssessTot 
## t = -15.05, df = 43286, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0 
## 95 percent confidence interval:
##  -1724546 -1327117 
## sample estimates:
## mean of x mean of y 
##   1233743   2759574

### Question 1. What does hypothesis test 1 tell you?

# Hypothesis test 2: two-sided Welch t-test on building area.
# Null hypothesis: the buildings in an historic district have the same
# area (BldgArea) as those outside of an historic district.
t.test(inHD$BldgArea, outHD$BldgArea,
       alternative = "two.sided", var.equal = FALSE)
## 
##  Welch Two Sample t-test
## 
## data:  inHD$BldgArea and outHD$BldgArea 
## t = -9.037, df = 15819, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0 
## 95 percent confidence interval:
##  -25103 -16154 
## sample estimates:
## mean of x mean of y 
##     22050     42678

## Question 2. What does hypothesis test 2 tell you about the size of the
## buildings inside and outside of historic districts?

## Select buildings on the same block as a historic district.
## Start with the block number of every building in a historic district.
blocks <- inHD[["Block"]]
head(blocks)
## [1] 1 1 7 7 7 7

## Keep every building in MN whose block number appears in `blocks`, and
## save the result as a new object.
HDB <- MN[is.element(MN$Block, blocks), ]

## What does the object HDB_out contain? The buildings on those blocks that
## are not themselves in a historic district (HD == 0).
HDB_out <- subset(HDB, HD == 0)

## HDB_in: the buildings on those blocks that are themselves in a historic
## district (HD == 1).
HDB_in <- subset(HDB, HD == 1)

## Hypothesis 3

# Two-sided Welch t-test of total assessed value between the HDB_in and
# HDB_out subsets.
t.test(HDB_in$AssessTot, HDB_out$AssessTot,
       alternative = "two.sided", var.equal = FALSE)  #Hypothesis Test 3
## 
##  Welch Two Sample t-test
## 
## data:  HDB_in$AssessTot and HDB_out$AssessTot 
## t = -9.728, df = 4349, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0 
## 95 percent confidence interval:
##  -1507426 -1001727 
## sample estimates:
## mean of x mean of y 
##   1233743   2488319


## Question 3. After controlling for location is the historic district
## designation associated with a difference in property values? Are the
## buildings in the historic district different from their non-historic
## neighbors? Use the p-value from hypothesis test 3 to support your
## conclusions.

# We have a problem.  Some buildings have 0 area (square footage).
# Distribution of building area among the HDB_in buildings.
summary(HDB_in[["BldgArea"]])
##     Min.  1st Qu.   Median     Mean  3rd Qu.     Max. 
##        0     4160     6370    22100    13000 17600000

# A zero area could mean the lot is vacant, or it could be an error. Either
# way it makes it hard to compute the price per square foot, so zero-area
# buildings are excluded from the t-test.

# Price per square foot for historic buildings, only for buildings with an
# area greater than 0.
HDB_in_sqft <- with(HDB_in[HDB_in$BldgArea > 0, ], AssessTot / BldgArea)

# Price per square foot for non-historic buildings, with the same
# restriction.
HDB_out_sqft <- with(HDB_out[HDB_out$BldgArea > 0, ], AssessTot / BldgArea)

# Now, use the objects 'HDB_in_sqft' and 'HDB_out_sqft' to construct a
# t-test using the t.test() function. If your output looks like the line
# below you have correctly constructed the t-test:
t.test(x = HDB_in_sqft, y = HDB_out_sqft, var.equal = FALSE)
## 
##  Welch Two Sample t-test
## 
## data:  HDB_in_sqft and HDB_out_sqft 
## t = -1.664, df = 4521, p-value = 0.09614
## alternative hypothesis: true difference in means is not equal to 0 
## 95 percent confidence interval:
##  -36.413   2.976 
## sample estimates:
## mean of x mean of y 
##     66.76     83.48

## Question 4. After controlling for location and building size, do
## historic and non-historic buildings have significantly different
## values? If your conclusion has changed between the 1st and 4th
## hypothesis tests, explain why.

## Using the data x and y generated above

## correlation

# x and y were never created anywhere in this script, so every cor() call
# below used to stop with "object 'y' not found". Bind them to the two
# columns Question 5 asks about (north-south position and total assessed
# value) so the calls run.
x <- MN$YCoord
y <- MN$AssessTot

cor(x, y)
cor(x, y, method = "pearson")  # Pearson is the default.
cor(x, y, method = "kendall")
cor(x, y, method = "spearman")

# Pearson (the default method) test of association between total assessed
# value and the Y coordinate.
cor.test(x = MN$AssessTot, y = MN$YCoord, method = "pearson")
## 
##  Pearson's product-moment correlation
## 
## data:  MN$AssessTot and MN$YCoord 
## t = -12.43, df = 43316, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0 
## 95 percent confidence interval:
##  -0.06902 -0.05025 
## sample estimates:
##      cor 
## -0.05964
# Same pair of variables, using Kendall's rank correlation.
cor.test(x = MN$AssessTot, y = MN$YCoord, method = "kendall")
## 
##  Kendall's rank correlation tau
## 
## data:  MN$AssessTot and MN$YCoord 
## z = -60.93, p-value < 2.2e-16
## alternative hypothesis: true tau is not equal to 0 
## sample estimates:
##     tau 
## -0.1952
# Same pair of variables, using Spearman's rank correlation.
cor.test(x = MN$AssessTot, y = MN$YCoord, method = "spearman")
## Warning: Cannot compute exact p-values with ties
## 
##  Spearman's rank correlation rho
## 
## data:  MN$AssessTot and MN$YCoord 
## S = 1.754e+13, p-value < 2.2e-16
## alternative hypothesis: true rho is not equal to 0 
## sample estimates:
##     rho 
## -0.2947

## Question 5.  Factors other than historic designation affect the price
## of real estate in MN.  Is there a significant correlation between a
## building's north-south position on the island (YCoord) and its total
## assessed value (AssessTot)? Are downtown buildings worth more than
## uptown buildings? Use cor.test() to answer the question.


# Distribution of the NumBldgs column.
summary(MN[["NumBldgs"]])
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    0.00    1.00    1.00    1.08    1.00  163.00
# Print the first 100 NumBldgs values.
MN[["NumBldgs"]][seq_len(100L)]
##   [1] 163  12   1   1   1   1   2   1   1   1   1   1   1   1   1   1   1
##  [18]   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1
##  [35]   1   1   1   1   6   1   2   1   1   1   1   1   1   1   0   1   1
##  [52]   1   1   1   2   1   2   1   1   2   1   1   1   1   1   1   1   1
##  [69]   1   1   1   1   1   1   1   1   1   1   2   1   1   1   1   1   0
##  [86]   1   1   1   1   1   1   1   1   1   2   1   1   1   1   1
# Kendall rank correlation between NumBldgs and total assessed value.
cor.test(x = MN$NumBldgs, y = MN$AssessTot, method = "kendall")
## 
##  Kendall's rank correlation tau
## 
## data:  MN$NumBldgs and MN$AssessTot 
## z = 52.41, p-value < 2.2e-16
## alternative hypothesis: true tau is not equal to 0 
## sample estimates:
##   tau 
## 0.202
# Spearman test of total assessed value against the Y coordinate; this
# repeats a call made earlier in the script with the same arguments.
cor.test(x = MN$AssessTot, y = MN$YCoord, method = "spearman")
## Warning: Cannot compute exact p-values with ties
## 
##  Spearman's rank correlation rho
## 
## data:  MN$AssessTot and MN$YCoord 
## S = 1.754e+13, p-value < 2.2e-16
## alternative hypothesis: true rho is not equal to 0 
## sample estimates:
##     rho 
## -0.2947

## Null hypothesis: there is no correlation between a building's
## north-south position (YCoord) and its total assessed value (AssessTot).
## This hypothesis is rejected because the t value is large in magnitude
## and the p-value is < 0.01. There is a negative correlation between
## AssessTot and YCoord; therefore total assessed value decreases for
## buildings located farther north.


## Question 6: Use the layout() function to make a chart showing buildings
## on the island of Manhattan in one panel and a scatter plot of the
## correlation between YCoord and AssessTot in the other panel.


# One row, two columns: panel 1 on the left, panel 2 on the right.
matrx <- matrix(c(1, 2), nrow = 1, ncol = 2)
layout(matrx)

# Left panel: map of building locations.
plot(MN$YCoord ~ MN$XCoord,
     xlab = "X Coordinate", ylab = "Y Coordinate",
     main = "Manhattan Building Locations")

# Right panel: assessed value on the x axis and Y coordinate on the y axis,
# with the x axis limited to 0-5e+08.
plot(MN$AssessTot, MN$YCoord, xlim = c(0, 5e+08),
     main = "Assessment Value by Y Coord",
     ylab = "Y Coordinate (Larger = Further North)",
     xlab = "Property Assessment Value ($)")

# Return the device to a single panel so that any plot drawn afterwards is
# not squeezed into one half of the two-panel layout.
layout(1)

plot of chunk unnamed-chunk-4