knitr::opts_chunk$set(echo = TRUE)
chooseCRANmirror(graphics=FALSE, ind=1)

#Problem 1
# Clear all variables and prior sessions
rm(list=ls(all=TRUE))

# Load Libraries
library(DAAG)
library(lattice)
library(foreign)
library(MASS)
library(car)
require(stats)
require(stats4)
library(KernSmooth)
library(fastICA)
library(cluster)
library(leaps)
library(mgcv)
library(rpart)
library(pan)
library(mgcv)
library(DAAG)



library("TTR")
library(tis)
require("datasets")
require(graphics)
library("forecast")
install.packages("astsa")
## 
## The downloaded binary packages are in
##  /var/folders/vc/t_bbs3715hjc0pf0v6j56mb80000gn/T//RtmpnkkZq9/downloaded_packages
require(astsa)
library(xtable)
library(stats)

library(corrplot) # We'll use corrplot later on in this example too.
library(visreg) # This library will allow us to show multivariate graphs.
library("rmarkdown")

data(nsw74psid1)
attach(nsw74psid1)

a) Univariate Histograms

Discussion of Properties: The distrubtion of dummy variables such as trt and black only have frequencies at 0 or 1. For age, the distribution is hump-shaped. Education’s histogram is skewed to the negatively, while Real Earnings is skewed positively.

quartz()
par(mfrow=c(2,2))
truehist(trt,col='steelblue3',main="Treatment Group",xlab="trt", ylab="Fraction")
lines(density(trt),lwd=2)

truehist(age,col='steelblue3',main="Age",xlab="Age(years)", ylab="Fraction")
lines(density(age),lwd=2)

truehist(educ,col='steelblue3',main="Education",xlab="Education(years)", ylab="Fraction")
lines(density(educ),lwd=2)

truehist(black,col='steelblue3',main="Black",xlab="Black", ylab="Fraction")
lines(density(black),lwd=2)

quartz()

par(mfrow=c(2,2))
truehist(hisp,col='steelblue3',main="Hispanic",xlab="Hisp", ylab="Fraction")
lines(density(hisp),lwd=2)

truehist(marr,col='steelblue3',main="Married",xlab="Married", ylab="Fraction")
lines(density(marr),lwd=2)

truehist(nodeg,col='steelblue3',main="No Degree",xlab="No Degree", ylab="Fraction")
lines(density(nodeg),lwd=2)

truehist(re74,col='steelblue3',main="Real Earnings in 1974",xlab="Real Earnings in 1974", ylab="Fraction")
lines(density(re74),lwd=2)

quartz()

par(mfrow=c(2,2))

truehist(re75,col='steelblue3',main="Real Earnings in 1975",xlab="Real Earnings in 1975", ylab="Fraction")
lines(density(re75),lwd=2)

truehist(re78,col='steelblue3',main="Real Earnings in 1978",xlab="Real Earnings in 1978", ylab="Fraction")
lines(density(re78),lwd=2)

b)Full Regression

y1 <- lm(re78~ trt+ (age + educ + re74 + re75) +
    (black + hisp + marr + nodeg), data = nsw74psid1)

summary(y1)
## 
## Call:
## lm(formula = re78 ~ trt + (age + educ + re74 + re75) + (black + 
##     hisp + marr + nodeg), data = nsw74psid1)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -64870  -4302   -435   3786 110412 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -129.74276 1688.51706  -0.077   0.9388    
## trt          751.94643  915.25723   0.822   0.4114    
## age          -83.56559   20.81380  -4.015 6.11e-05 ***
## educ         592.61020  103.30278   5.737 1.07e-08 ***
## re74           0.27812    0.02792   9.960  < 2e-16 ***
## re75           0.56809    0.02756  20.613  < 2e-16 ***
## black       -570.92797  495.17772  -1.153   0.2490    
## hisp        2163.28118 1092.29036   1.981   0.0478 *  
## marr        1240.51952  586.25391   2.116   0.0344 *  
## nodeg        590.46695  646.78417   0.913   0.3614    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 10070 on 2665 degrees of freedom
## Multiple R-squared:  0.5864, Adjusted R-squared:  0.585 
## F-statistic: 419.8 on 9 and 2665 DF,  p-value: < 2.2e-16

c)Compute the Mallows CP Statistic for all plausible Models

Choose the model: I chose the model with the 2 variables re74 and re75 because it lowers the Mallows CP Statistic by a significant amount while still following the parsimony principle, as the addition of more variables did not decrease the Mallows CP that much.

quartz()
ss=regsubsets(re78~ trt+ (age + educ + re74 + re75) +
    (black + hisp + marr + nodeg),method=c("exhaustive"),nbest=3,data=nsw74psid1)
subsets(ss,statistic="cp",legend=F,main="Mallows CP",col="steelblue4")
##       Abbreviation
## trt              t
## age              a
## educ             e
## re74           r74
## re75           r75
## black            b
## hisp             h
## marr             m
## nodeg            n
legend(8,1500,bty="n",legend=c('r78=Real earnings in 1975','t=treatment','a=age','e=years of education','r74=Real earnings in 1974','r75=Real earnings in 1975','r74=Real earnings in 1974','b=black','h=hispanic','m=married','n=no degree'),col="steelblue4",cex=5)

d)Plot the residuals vs. the fitted values

Use New Model: y2<- lm(re78~ re74 + re75)

y2<- lm(re78~ re74 + re75)
quartz()
plot(y2$fit,y2$res, pch=20,col="skyblue4",lwd=10)
abline(h=0,lwd=2,col = "red3")

e)Plot and discuss the VIF plot(variance inflation factor plot, computed for the standard errors of linear model coefficient estimates)

vif(y2)
##     re74     re75 
## 3.738247 3.738247

f)Plot and discuss the correlation graph(use the corrplot library)

re75 and re74 have the highest correlations with re78, indicating why these two variables are the best predictors of re78.

quartz()
newdatacor = cor(nsw74psid1)
corrplot(newdatacor, method = "number")

#g)Plot the Cook’s Distance values. Are there any outliers? If so, discuss what you would do with them. #Yes, there are outliers, as some points have significantly larger Cook’s distance statistics. To deal with them, I would create multiple models with and without the outliers, or put less weight on the outliers and compare the results to make sure that the outliers do not have too large or an effect on my model.

quartz()
y2_cook=cooks.distance(y2)
plot(y2_cook,ylab="Cook's distance",type='o',main="Cook's Distance Plot",col="skyblue4", pch=20,lwd=.25)

#h)Plot a histogram of the residuals and discuss the results.

quartz()
truehist(y2$res,15,col="skyblue3",xlab="Residuals",ylab="Fraction",main="Histogram of Residuals")

#i) Plot the QQ Normal Plot and discuss the results. #The QQ Normal Plot reveals that the residuals are not normally distributed, as it is very off-line with the expected 45 degree line produced from a normal distribution.

quartz()
qqnorm(y2$res,col="skyblue4", pch=20,lwd=1,main="QQ Normal Plot")

j) Plot the observed vs. predicted values, overalay a Lowess smoother, and discuss the results.

Discussion of Results: The model fits fairly well, with R squared = 0.826. The Lowess Smoother is pretty similar to the yobs=ypred line, at first showing that the model overpredicts, then underprecdicts the observed values.

quartz()
plot(y2$fit,re78,pch=20,col="skyblue4",cex=1,xlab="Predicted Response",ylab="Observed Response",main="Observed vs. Predicted Response \nNew Model",cex.axis=0.8,cex.main=1.0)
lines(lowess(y2$fit,re78),lwd=2)
abline(0,1,col="red",lwd=2,lty=2)
text(100000,20000,expression(R^2==0.826))
legend(80000,16000, c(expression(y[obs]==y[pred]), "Lowess Smoother"), fill =c("red", "black"),cex=1,bty="y")

Problem 2.2)“Download the U.S. GDP quarterly growth rates and the Standard & Poor’s (SP) 500 quarterly returns. For both series, compute their descriptive statistics and their histograms. Are these two series contemporaneously correlated? Comment on your findings.”

z<-read.table("Problem_2_data.csv",header=T, sep=",")
attach(z)

Descriptive Statistics: The Mean for RETURN is 1.962, with a maximum of 20.117 and a minimum of -26.937. The Mean for GRGDP is 0.7958, with a maximum of 3.9343 and a minimum of -2.7075.

summary(GRGDP)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
## -2.7075  0.3273  0.7679  0.7958  1.3107  3.9343
summary(RETURN)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
## -26.937  -0.915   2.023   1.962   5.753  20.117

Histograms

quartz()
par(mfrow=c(2,2))
truehist(GRGDP,col='steelblue3',main="US GDP Qarterly Growth Rates",xlab="GRGDP", ylab="Fraction")
lines(density(GRGDP),lwd=2)

truehist(RETURN,col='steelblue3',main="SP 500 quarterly returns",xlab="RETURN (%)", ylab="Fraction")
lines(density(RETURN),lwd=2)

These two series ARE contemporaneously correlated, as we can see by comparing the time series plots of the two variables GRGDP and RETURN. (Correlation between the realizations of two time series variables in the same time period.)

Need to convert the series into a time series in order to know

# Read in the data into a data file and attach names:
names(z)= c("date","GRGDP","RETURN")
attach(z)

# Convert data to time series format:
GRGDP_ts<-ts(GRGDP,start=c(1950,2),freq=4)
t1<-seq(1950.50, 2012.25,length=length(GRGDP_ts))

RETURN_ts<-ts(RETURN,start=c(1950,2),freq=4)
t2<-seq(1950.50, 2012.25,length=length(RETURN_ts))

#Plot time series
quartz()
par(mfrow=c(2,1))

plot.ts(GRGDP_ts)
nberShade()
plot.ts(RETURN_ts)
nberShade()

Problem 3.1

“1. Download monthly data on real personal consumption expenditures and real disposable personal income from the Federal Reserve Economic Database (FRED) of the St. Louis Fed ( http://research.stlouisfed.org/fred2 ). Take a sample starting on 1959:01 and continuing to the most recent month.

a) Calculate the growth rates of real consumption and disposable personal income and plot the data (simply calculate changes in natural-log). How do you compare the level of volatility on consumption growth with that of income growth? Can you explain it with your knowledge of macroeconomics? ( Hint: Consider the permanent income model.)

I would say that the level of volatility of consumption growth is less volatile than that of income growth. In terms of macroeconomics, this fits the permanent income model, as people will consume based on their lifetime income, not just their current income.

# Read in the data into a data file and attach names:
RPCE_data<-read.table("3.1_exercises_data.csv",header=T, sep=",")
names(RPCE_data)= c("date","rpce","rdpi")
attach(RPCE_data)

# Convert data to time series format:
RPCE_ts<-ts(rpce,start=c(1959,1),freq=12)
RDPI_ts<-ts(rdpi,start=c(1959,1),freq=12)

# Growth rates of Real consumption and Real disposable personal income
#calculate changes in natural log

change_log_RPCE<- log(RPCE_ts)- log(lag(RPCE_ts))

change_log_RDPI<- log(RDPI_ts) - log(lag(RDPI_ts))

t10<-seq(1959, 2012.33,length=length(change_log_RDPI))

r1<-lm(change_log_RPCE ~ t10)
quartz()
par(mfrow=c(2,1))
plot(change_log_RPCE,ylab="Real Consumption Growth", xlab="Time", lwd=2, col='skyblue3', xlim=c(1959,2013))


r2<-lm(change_log_RDPI ~ t10)
plot(change_log_RDPI,ylab="Real Disposable Personal Income Growth", xlab="Time", lwd=2, col='skyblue3', xlim=c(1959,2013))

b) Regress consumption growth on disposable income growth. Interpret the estimated equation and discuss statistical significance.

The estimated equation is change_log_RDPI = 0.969664(change_log_RPCE). This means that for every 0.969664 percent increase in consumption growth, there is a 1 percent increase in disposable income growth. Since p-value=0.0169, this explanatory variable is not statistically significant. Adjusted R squared is 0.007369, extremely low.

r3<-lm(change_log_RDPI ~ change_log_RPCE)

summary(r3)
## 
## Call:
## lm(formula = change_log_RDPI ~ change_log_RPCE)
## 
## Residuals:
##       Min        1Q    Median        3Q       Max 
## -0.049532 -0.002385  0.000158  0.002529  0.043006 
## 
## Coefficients:
##                   Estimate Std. Error t value Pr(>|t|)    
## (Intercept)     -0.0018274  0.0003103  -5.890 6.26e-09 ***
## change_log_RPCE  0.3042768  0.0509016   5.978 3.77e-09 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.007021 on 637 degrees of freedom
## Multiple R-squared:  0.05312,    Adjusted R-squared:  0.05163 
## F-statistic: 35.73 on 1 and 637 DF,  p-value: 3.768e-09

c) Add a lag of the growth in disposable income to the equation that you estimated in b. Based on your estimates, comment on the possibility of an adjustment lag in consumption growth."

The adjusted R squared went up, so I would say that there is a possibility of an adjustment lag in consumption growth.

lag_growth<- change_log_RDPI

r4<-lm(change_log_RDPI ~ change_log_RPCE)

summary(r4)
## 
## Call:
## lm(formula = change_log_RDPI ~ change_log_RPCE)
## 
## Residuals:
##       Min        1Q    Median        3Q       Max 
## -0.049532 -0.002385  0.000158  0.002529  0.043006 
## 
## Coefficients:
##                   Estimate Std. Error t value Pr(>|t|)    
## (Intercept)     -0.0018274  0.0003103  -5.890 6.26e-09 ***
## change_log_RPCE  0.3042768  0.0509016   5.978 3.77e-09 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.007021 on 637 degrees of freedom
## Multiple R-squared:  0.05312,    Adjusted R-squared:  0.05163 
## F-statistic: 35.73 on 1 and 637 DF,  p-value: 3.768e-09

Problem 3.3

(1) For Homework 1, Problem 3.3, the frequencies for the respective variables are:a) quarterly b) daily c) daily d) monthly

“Visit again the website of the Federal Reserve Bank in St. Louis ( http://research.stlouisfed.org ) and download the following data. For each data set, plot the time series, and write the exact definition, periodicity, and units. Judge whether the underlying stochastic process may be first and second order weakly stationary. Explain your rationale.”

a) U.S. real GDP

The U.S. real GDP is the inflation adjusted value of the goods and services produced by labor and property located in the United states, measured in billions of chained 2005 dollars and calculated every quarter at a seasonally adjusted annual rate. The increasing trend indicates that this process is not stationary, as its moments are not constant across time.

# Read in the data into a data file and attach names:
USGDP_data<-read.table("3.3a_exercises_data.csv",header=T, sep=",")
names(USGDP_data)= c("date","rgdp")
attach(USGDP_data)

# Convert data to time series format:
RGDP_ts<-ts(rgdp,start=c(1947,1),freq=4)
t4<-seq(1947, 2012,length=length(RGDP_ts))

b) The exchange rate of the Japanese yen against the U.S. dollar

The exhange rate of the Japanese yen against the U.S. dollar is calculated as the noon buying rates in New York City for cable transfers payable in foreign currencies. This is calculated daily, and measured in units of the ratio of Japanese Yen to One U.S. dollar. The increasing trend indicates that this process is not stationary, as its moments are not constant across time.

# Read in the data into a data file and attach names:
JPY_USD_data<-read.table("3.3b_exercises_data.csv",header=T, sep=",")
names(USGDP_data)= c("DATE","jpy_usd")
attach(JPY_USD_data)

# Convert data to time series format:
JPY_USD_ts<-ts(rgdp,start=c(1971,4),freq=365)
t5<-seq(1971, 2012.5,length=length(JPY_USD_ts))

c)The 10-year U.S. Treasury constant maturity yield

The 10-year U.S. Treasury Maturity Yield is the yield to maturity of 10-Year Zero Coupon Bonds issued by the Treasury, measured daily and in units of percent. The increasing trend indicates that this process is not stationary, as its moments are not constant across time.

# Read in the data into a data file and attach names:
TreasuryYield_data<-read.table("3.3c_exercises_data.csv",header=T, sep=",")
names(TreasuryYield_data)= c("DATE","CMRate10Yr")
attach(TreasuryYield_data)

# Convert data to time series format:
Yield_ts<-ts(rgdp,start=c(1962,2),freq=365)
t6<-seq(1962, 2012.5,length=length(Yield_ts))

d)The U.S. unemployment rate

The U.S. unemployment rate is representative of the number of unemployed as a percentage of the labor force. The labor force data are restricted to people 16 years or older, live in the U.S., and are not institutionalized or in the military. The statistic is calculated monthly, with units of percent, and is seasonally adjusted. This series appears to be weakly stationary, as mean and covariance appear constant acrosss time.

# Read in the data into a data file and attach names:
unemployment_data<-read.table("3.3d_exercises_data.csv",header=T, sep=",")
names(unemployment_data)= c("DATE","unemrate")
attach(unemployment_data)

# Convert data to time series format:
unemployment_ts<-ts(unemrate,start=c(1948,1),freq=12)
t7<-seq(1948, 2012.4167,length=length(unemployment_ts))

#Plot time series for part a-d
quartz()
par(mfrow=c(2,2))

plot.ts(RGDP_ts)
nberShade()
plot.ts(JPY_USD_ts)
nberShade()
plot.ts(Yield_ts)
nberShade()
plot.ts(RETURN_ts)
nberShade(unemployment_ts)

Probelm 4.3

“Download the same data as in Exercise 1, but at the quarterly frequency. Compute the ACF and PACF functions of quarterly house prices, interest rates, house price growth, and interest rates changes. Comment on the differences across autocorrelation functions. In which series do you find stronger time dependence? Examine the differences in the autocorrelation functions of quarterly data versus annual data.”

Differences across autocorrelation functions include highest ACFs for Quarterly House Prices, then ACFs of Interest Rates, with smaller ACFs for House Price Growth, then smallest ACFs for Interest Rates Changes. In the times series of Quarterly House Prices, I find a stronger time dependence. Examining the differences in the autocorrelation functions of quarterly data versus annual data, I see that the autocorrelation functions have higher values in the quarterly data compared to those of the annual data.

# Read in the data into a data file and attach names:
Housing_data<-read.table("Chapter4.3_exercises_data.csv",header=T, sep=",")
names(Housing_data)= c("OBS","P","R")
attach(Housing_data)

# Convert data to time series format:
Housing_ts<-ts(P,start=c(1980,2),freq=4)
t8<-seq(1980.25, 2011.75,length=length(Housing_ts))

Pgrowth = log(Housing_ts)-log(lag(Housing_ts))

Housinggrowth_ts<-ts(Pgrowth,start=c(1980,2),freq=4)

Int_rate_ts<-ts(R,start=c(1980,2),freq=4)

Rchange = log(Int_rate_ts)-log(lag(Int_rate_ts))

Int_rate_change_ts<-ts(Rchange,start=c(1980,2),freq=4)

#Quarterly House Prices
quartz()
par(mfrow=c(2,1))

#ACF Functions
acf(Housing_ts,main="ACF of Quarterly House Prices")

#PACF Functions
pacf(Housing_ts,main="PACF of the Quarterly House Prices")

#**************************  
#House Price Growth
quartz()
par(mfrow=c(2,1))

#ACF Functions
acf(Housinggrowth_ts,main="ACF of House Price Growth")

#PACF Functions
pacf(Housinggrowth_ts,main="PACF of House Price Growth")

#**************************
#Interest Rates
quartz()
par(mfrow=c(2,1))
#ACF Functions
acf(Int_rate_ts,main="ACF of Interest Ratee")

#PACF Functions
pacf(Int_rate_ts,main="PACF of the Interest Rates")

#**************************
#Interest rates changes
quartz()
par(mfrow=c(2,1))
#ACF Functions
acf(Int_rate_change_ts,main="ACF of the Interest Rates Changes")

#PACF Functions
pacf(Int_rate_change_ts,main="PACF of the Interest Rates Changes")