Tyrha FINAL.R

Tyrha — Dec 15, 2013, 11:42 PM

# ---- Data setup ----
# Loads nothing by itself: the data frame `final` must already exist in the
# workspace. NOTE(review): the import step for `final` is missing from this
# script -- TODO add the actual read call for the data file.
setwd("~/final")
if (!exists("final")) {
  stop("data frame 'final' not found: load the data before running this script",
       call. = FALSE)
}

# Later sections refer to the columns (Price, Beds, Baths, Size, Price_Sqft,
# Year) by bare name, so the data frame is kept attached.
attach(final)
summary(final)

# Log-transform price and bedroom count. log(0) is -Inf, which lm() cannot
# use, so those observations are recoded as missing.
log_Price <- log(Price)
log_Beds <- log(Beds)
log_Price[is.infinite(log_Price)] <- NA
log_Beds[is.infinite(log_Beds)] <- NA

## Q1: non-constant variance, non-normality ##
# Working model: log(Beds) on log(Price), Baths, Year and Size.
g <- lm(log_Beds ~ log_Price + Baths + Year + Size)

# Residuals padded with NA for any row lm() dropped, so that they line up
# element-for-element with the original variables in the panel plots below.
res_g <- residuals(update(g, na.action = na.exclude))

# NON-CONSTANT VARIANCE
# 1. Use graphs: |residuals| against fitted values
plot(fitted(g), abs(residuals(g)), xlab = "Fitted", ylab = "|Residuals|")

# 2. Use tests: regress |residuals| on the fitted values
summary(lm(abs(residuals(g)) ~ fitted(g)))

# Check whether the non-constant variance is related to a particular
# variable: residuals of g against each variable, one panel per variable.
old_par <- par(mfrow = c(2, 3))
panel_vars <- list(
  Beds = Beds,
  Price = Price,
  Baths = Baths,
  Size = Size,
  Price_Sqft = Price_Sqft,
  Year = Year
)
for (nm in names(panel_vars)) {
  plot(panel_vars[[nm]], res_g, xlab = nm, ylab = "Residuals")
}

# NON-NORMALITY #
# 1. QQ-plot for detecting non-normality
par(mfrow = c(1, 2))
qqnorm(residuals(g), ylab = "Residuals")
qqline(residuals(g))

# The histogram is shown for comparison only; it is not well suited to
# detecting non-normality.
hist(residuals(g))
par(mfrow = c(1, 1))

# 2. Test of non-normal errors
shapiro.test(residuals(g))

# 3. Make adjustments (try to improve the non-constant variance):
# compare the full model with one using a square-root-transformed response.
gg <- lm(log_Beds ~ log_Price + Baths + Size + Price_Sqft + Year)
gs <- lm(sqrt(log_Beds) ~ log_Price + Baths + Size + Price_Sqft + Year)
par(mfrow = c(2, 1))
plot(fitted(gg), residuals(gg), xlab = "Fitted", ylab = "Residuals")
plot(fitted(gs), residuals(gs), xlab = "Fitted", ylab = "Residuals")

# Restore the plotting layout that was active before this section.
par(old_par)

## Q2: Unusual observations ##
# Leverage, Cook's distance and a partial residual plot for the model `g`
# fitted in Q1.
library(faraway) # provides halfnorm() and prplot()

# Cook's distance for detecting influential observations #
cook <- cooks.distance(g)

# Half-normal plot of Cook's distance, labelling the three largest values
# with their observation names #
halfnorm(cook, nlab = 3, labs = names(cook), ylab = "Cook's distance")

# Model fit excluding the observation with the largest Cook's distance #
# The names of `cook` are taken to be row numbers of the data, which holds
# because g was fitted without a `data` argument -- verify if that changes.
worst <- as.integer(names(which.max(cook)))
stopifnot(length(worst) == 1L, !is.na(worst))
# Refit the *same* model as g, so any change in the coefficients is due only
# to dropping the influential observation.
g1 <- update(g, subset = -worst)

# Comparison of model fits with and without the influential observation #
coef(g1)
coef(g)
summary(g1)
summary(g)

# Partial residual plot for the first predictor in g #
prplot(g, 1)

# Half-normal plot of the leverages #
lev <- hatvalues(g)
halfnorm(lev, labs = names(lev), ylab = "Leverages")


## Q3: Generalized Least Squares (GLS) ##
# GLS is a statistical technique for estimating the unknown parameters of a
# linear regression model when the errors are correlated or have unequal
# variances.

## Q4: Weighted Least Squares (WLS) ##
# WLS is another way of dealing with non-constant error variance: when the
# errors are uncorrelated but have unequal variances, and the form of the
# inequality is known, each observation can be weighted accordingly.

# 2. Use a model for the error variance as a function of the predictors
g <- lm(Beds ~ Price + Baths + Size + Price_Sqft + Year)
g1 <- lm(log(residuals(g)^2) ~ Price + Baths + Size + Price_Sqft + Year)
summary(g1)
# Reported result: this model is marginally significant; reading from the
# results, the log of the squared residuals is a function of Price_Sqft.

# 3. Fit linear model using WLS
# Size is the variable that is going to be used as weight. The listed sizes
# are recoded to the codes 1..8 (position in `size_values`); any other value
# is left unchanged, as in the original recode.
# NOTE(review): the codes are not monotone in Size -- confirm the intended
# weighting scheme.
size_values <- c(1450, 1600, 2100, 2400, 2200, 1900, 1800, 2605)
size_code <- Size
hit <- match(Size, size_values)
size_code[!is.na(hit)] <- hit[!is.na(hit)]

# WLS regression with weights inversely proportional to the size code
g <- lm(log_Beds ~ Size + log_Price + Baths + Price_Sqft + Year,
        weights = 1 / size_code)
coef(g)

## Q5: Test of Lack of Fit ##
# Model 1: straight-line regression of log(Beds) on Size.
g <- lm(log_Beds ~ Size)
summary(g)
plot(log_Beds ~ Size)
# Reported result for the regression of log(Beds) on Size:
# Multiple R-squared: 0.1537; p-value: 0.02648 (statistically significant).

# Model 2: group-means model (one mean per distinct Size), fitted to see
# whether it improves on the straight line.
ga <- lm(log_Beds ~ factor(Size))
summary(ga)
# Reported result: an improved fit (Multiple R-squared: 0.9738; p-value: .0539).

# Lack-of-fit test: compare the straight-line model (g) with the group-means
# model (ga).
anova(g, ga)
# Reported conclusion: Model 2 is a better fitting model than Model 1.

## Q6: Robust Regression ##
# Variables used: Beds, Price, Baths, Size, Price_Sqft, Year
library(MASS)     # rlm() (Huber) and ltsreg() (LTS)
library(quantreg) # rq() (LAD / median regression)

# Ordinary least squares fit and a normality check of its residuals
g <- lm(log_Beds ~ log_Price + Baths + Size + Price_Sqft + Year)
coef(summary(g))
shapiro.test(residuals(g))

# Huber method of regression (M-estimation):
gr <- rlm(log_Beds ~ log_Price + Baths + Size + Price_Sqft + Year)
summary(gr)

# LAD (least absolute deviations) regression:
gr <- rq(log_Beds ~ log_Price + Baths + Size + Price_Sqft + Year)
summary(gr)$coefficients

# LTS (least trimmed squares) regression: reported to show some improvement
coef(ltsreg(log_Beds ~ log_Price + Baths + Size + Price_Sqft + Year))

# Comparison of LS, Huber, LAD and LTS on the simple regression of
# log(Beds) on log(Price). Each method gets its own line type; integer lty
# values above 6 wrap around to solid, so only 1-6 are used.
plot(log_Beds ~ log_Price)
abline(coef(lm(log_Beds ~ log_Price)))               # LS
abline(coef(rlm(log_Beds ~ log_Price)), lty = 2)     # Huber
abline(coef(rq(log_Beds ~ log_Price)), lty = 5)      # LAD
abline(coef(ltsreg(log_Beds ~ log_Price)), lty = 3)  # LTS
legend("topleft", legend = c("LS", "Huber", "LAD", "LTS"),
       lty = c(1, 2, 5, 3))

## Q7: Automated Variable Selection ##
# Variables used: Beds, Price, Baths, Size, Price_Sqft, Year
# 1. Role of ANOVA in Big vs. Small models
g1 <- lm(log_Beds ~ log_Price + Baths + Size + Price_Sqft + Year)
summary(g1)

# Baths is reported as least significant, so it is removed in a smaller model
g2 <- update(g1, . ~ . - Baths)
summary(g2)
# Reported: the p-value is marginally significant when "Baths" is removed

g3 <- update(g2, . ~ . - Year)
summary(g3)
# Reported: removing Year makes the p-value of the model marginally significant

# F-tests comparing the nested models, smallest to largest.
# anova() requires all three fits to use the same observations.
anova(g3, g2, g1)

# 2. Backward variable selection
g <- lm(log_Beds ~ log_Price + Baths + Size + Price_Sqft + Year)
summary(g)

# Remove Baths
g <- update(g, . ~ . - Baths)
summary(g)

# Remove Year
g <- update(g, . ~ . - Year)
summary(g)
# Reported: with Baths and Year removed, the model is marginally significant

# Remove Size
g <- update(g, . ~ . - Size)
summary(g)
# Reported: once Baths, Year and Size are removed, the model becomes
# significant with p=.02726 (p<.05)

# 3. Stepwise regression
g <- lm(log_Beds ~ log_Price + Baths + Price_Sqft + Size + Year)
step(g)
# "AIC" is a criterion-based procedure and stands for Akaike Information
# Criterion. The selected variables are listed in the last step; reported
# result: AIC=-99.72 with Price selected.
# Reported: stepwise regression gave the same result as backward selection.

## Q8: Cross - Validation ##
# Variables used: Beds, Price, Baths, Price_Sqft, Size, Year
# The goal of cross-validation is to estimate how well the model will predict
# observations that were not used to fit it.

# Complete cases only, so every fold is fitted and scored on usable rows.
# NOTE(review): response and predictors follow the other sections
# (log_Beds on log_Price, Baths, Price_Sqft, Size, Year) -- confirm.
cv_data <- na.omit(
  data.frame(log_Beds, log_Price, Baths, Price_Sqft, Size, Year)
)

# Full model, reduced by AIC-based stepwise selection
g <- lm(log_Beds ~ ., data = cv_data)
g.step <- step(g)
summary(g.step)
# Reported: overall p-value of the selected model p=.008 (p<.05)

# 3-fold cross-validation of the selected model
set.seed(2013) # reproducible fold assignment
n_folds <- 3L
fold_id <- sample(rep(seq_len(n_folds), length.out = nrow(cv_data)))
cv_mse <- vapply(seq_len(n_folds), function(k) {
  train <- cv_data[fold_id != k, , drop = FALSE]
  held_out <- cv_data[fold_id == k, , drop = FALSE]
  fit <- lm(formula(g.step), data = train)
  # Mean squared prediction error on the held-out fold
  mean((held_out$log_Beds - predict(fit, newdata = held_out))^2)
}, numeric(1))
cv_mse       # per-fold mean squared prediction error
mean(cv_mse) # cross-validated estimate of prediction error

par(mfrow = c(1, 1))