library(ggplot2)
require(gridExtra)
library(stargazer)

Transformations

setwd('~/desktop/Ostats')
options(digits=2)
nitrate <- read.table("Nitrate.txt", header=TRUE)  ## Read in data
head(nitrate)  ## Display first 6 rows of dataset
##      RIVER   COUNTRY DISCHARG RUNOFF PREC    AREA DENSITY  NO3  DEP NPREC
## 1    Adige     Italy  2.2e+02   18.3   85    1220   102.0 67.0 1238  46.0
## 2   Amazon S_America  1.8e+05   24.8  181 7050000     1.0  3.0  121   2.1
## 3   Caragh   Ireland  7.3e+00   45.6  105     160     7.2  3.6   86   2.6
## 4 Columbia       USA  7.9e+03   11.8   99  670000    10.0 26.6   63   2.0
## 5   Danube   Rumania  6.5e+03    8.1   58  805000    90.0 46.0  826  45.0
## 6 Delaware       USA  3.4e+02   19.1  107   17600   100.0 61.0  852  25.0
names(nitrate)  ## Retrieve names of variables from dataset
##  [1] "RIVER"    "COUNTRY"  "DISCHARG" "RUNOFF"   "PREC"     "AREA"    
##  [7] "DENSITY"  "NO3"      "DEP"      "NPREC"
## Add transformed variables to the data.  We will consider log, sqrt, and third power.
nitrate <- cbind(nitrate,
                 DISCHARG.2=(nitrate$DISCHARG)^.5,     # column DISCHARG.2 contains the sqrt DISCHARG values
                 DISCHARG.ln=log(nitrate$DISCHARG),    # column DISCHARG.ln contains the natural logs of the DISCHARG values
                 DISCHARG.3=(nitrate$DISCHARG)^(1/3),  # column DISCHARG.3 contains the DISCHARG values to the third power
                 RUNOFF.2=(nitrate$RUNOFF)^.5,         # column RUNOFF.2 contains the sqrt RUNOFF values
                 RUNOFF.ln=log(nitrate$RUNOFF),        # column RUNOFF.ln contains the natural logs of the RUNOFF values
                 RUNOFF.3=(nitrate$RUNOFF)^(1/3),      # column RUNOFF.3 contains the RUNOFF values to the third power values
                 PREC.2=(nitrate$PREC)^.5,             # column PREC.2 contains the sqrt PREC values
                 PREC.ln=log(nitrate$PREC),            # column PREC.ln contains the natural logs of the PREC values
                 PREC.3=(nitrate$PREC)^(1/3),          # column PREC.3 contains the PREC values to the third power
                 AREA.2=(nitrate$AREA)^.5,             # column AREA.2 contains the sqrt AREA values
                 AREA.ln=log(nitrate$AREA),            # column AREA.ln contains the natural logs of the AREA values
                 AREA.3=(nitrate$AREA)^(1/3),          # column AREA.3 contains the AREA values to the third power
                 DENSITY.2=(nitrate$DENSITY)^.5,       # column DENSITY.2 contains the sqrt DENSITY values
                 DENSITY.ln=log(nitrate$DENSITY),      # column DENSITY.ln contains the natural logs of the DENSITY values
                 DENSITY.3=(nitrate$DENSITY)^(1/3),    # column DENSITY.3 contains the DENSITY values to the third power
                 NO3.2=(nitrate$NO3)^.5,               # column NO3.2 contains the sqrt NO3 values
                 NO3.ln=log(nitrate$NO3),              # column NO3.ln contains the natural logs of the NO3 values
                 NO3.3=(nitrate$NO3)^(1/3),            # column NO3.3 contains the NO3 values to the third power
                 DEP.2=(nitrate$DEP)^.5,               # column DEP.2 contains the sqrt DEP values
                 DEP.ln=log(nitrate$DEP),              # column DEP.ln contains the natural logs of the DEP values
                 DEP.3=(nitrate$DEP)^(1/3),            # column DEP.3 contains the DEP values to the third power
                 NPREC.2=(nitrate$NPREC)^.5,           # column NPREC.2 contains the sqrt NPREC values
                 NPREC.ln=log(nitrate$NPREC),          # column NPREC.ln contains the natural logs of the NPREC values
                 NPREC.3=(nitrate$NPREC)^(1/3)         # column NPREC.3 contains the NPREC values to the third power
                 )

Histograms of predictor variables and their transformations

## Nitrate
## Find the transformations hat best conform to the normality assumption
N <- ggplot(nitrate, aes(x=NO3)) + 
  geom_histogram(colour = "darkgreen", fill = "white") +
  theme_bw()
N2 <- ggplot(nitrate, aes(x=NO3.2)) + 
  geom_histogram(colour = "darkgreen", fill = "white") +
  theme_bw()
Nln <- ggplot(nitrate, aes(x=NO3.ln)) + 
  geom_histogram(colour = "darkgreen", fill = "white") +
  theme_bw()
N3 <- ggplot(nitrate, aes(x=NO3.3)) + 
  geom_histogram(colour = "darkgreen", fill = "white") +
  theme_bw()
grid.arrange(N, N2, Nln, N3, ncol=4)

## Nitrate Precipitation
NP <- ggplot(nitrate, aes(x=NPREC)) + 
  geom_histogram(colour = "darkblue", fill = "white") +
  theme_bw()
NP2 <- ggplot(nitrate, aes(x=NPREC.2)) + 
  geom_histogram(colour = "darkblue", fill = "white") +
  theme_bw()
NPln <- ggplot(nitrate, aes(x=NPREC.ln)) + 
  geom_histogram(colour = "darkblue", fill = "white") +
  theme_bw()
NP3 <- ggplot(nitrate, aes(x=NPREC.3)) + 
  geom_histogram(colour = "darkblue", fill = "white") +
  theme_bw()
grid.arrange(NP, NP2, NPln, NP3, ncol=4)

## Discharge
DG <- ggplot(nitrate, aes(x=DISCHARG)) + 
  geom_histogram(colour = "purple", fill = "white") +
  theme_bw()
DG2 <- ggplot(nitrate, aes(x=DISCHARG.2)) + 
  geom_histogram(colour = "purple", fill = "white") +
  theme_bw()
DGln <- ggplot(nitrate, aes(x=DISCHARG.ln)) + 
  geom_histogram(colour = "purple", fill = "white") +
  theme_bw()
DG3 <- ggplot(nitrate, aes(x=DISCHARG.3)) + 
  geom_histogram(colour = "purple", fill = "white") +
  theme_bw()
grid.arrange(DG, DG2, DGln, DG3, ncol=4)

## Runoff
R <- ggplot(nitrate, aes(x=RUNOFF)) + 
  geom_histogram(colour = "orange", fill = "white") +
  theme_bw()
R2 <- ggplot(nitrate, aes(x=RUNOFF.2)) + 
  geom_histogram(colour = "orange", fill = "white") +
  theme_bw()
Rln <- ggplot(nitrate, aes(x=RUNOFF.ln)) + 
  geom_histogram(colour = "orange", fill = "white") +
  theme_bw()
R3 <- ggplot(nitrate, aes(x=RUNOFF.3)) + 
  geom_histogram(colour = "orange", fill = "white") +
  theme_bw()
grid.arrange(R, R2, Rln, R3, ncol=4)

## Precipitation
P <- ggplot(nitrate, aes(x=PREC)) + 
  geom_histogram(colour = "darkred", fill = "white") +
  theme_bw()
P2 <- ggplot(nitrate, aes(x=PREC.2)) + 
  geom_histogram(colour = "darkred", fill = "white") +
  theme_bw()
Pln <- ggplot(nitrate, aes(x=PREC.ln)) + 
  geom_histogram(colour = "darkred", fill = "white") +
  theme_bw()
P3 <- ggplot(nitrate, aes(x=PREC.3)) + 
  geom_histogram(colour = "darkred", fill = "white") +
  theme_bw()
grid.arrange(P, P2, Pln, P3, ncol=4)

## Area
A <- ggplot(nitrate, aes(x=AREA)) + 
  geom_histogram(colour = "green", fill = "white") +
  theme_bw()
A2 <- ggplot(nitrate, aes(x=AREA.2)) + 
  geom_histogram(colour = "green", fill = "white") +
  theme_bw()
Aln <- ggplot(nitrate, aes(x=AREA.ln)) + 
  geom_histogram(colour = "green", fill = "white") +
  theme_bw()
A3 <- ggplot(nitrate, aes(x=AREA.3)) + 
  geom_histogram(colour = "green", fill = "white") +
  theme_bw()
grid.arrange(A, A2, Aln, A3, ncol=4)

## Density
Dn <- ggplot(nitrate, aes(x=DENSITY)) + 
  geom_histogram(colour = "black", fill = "white") +
  theme_bw()
Dn2 <- ggplot(nitrate, aes(x=DENSITY.2)) + 
  geom_histogram(colour = "black", fill = "white") +
  theme_bw()
Dnln <- ggplot(nitrate, aes(x=DENSITY.ln)) + 
  geom_histogram(colour = "black", fill = "white") +
  theme_bw()
Dn3 <- ggplot(nitrate, aes(x=DENSITY.3)) + 
  geom_histogram(colour = "black", fill = "white") +
  theme_bw()
grid.arrange(Dn, Dn2, Dnln, Dn3, ncol=4)

## Deposition
D <- ggplot(nitrate, aes(x=DEP)) + 
  geom_histogram(colour = "brown", fill = "white") +
  theme_bw()
D2 <- ggplot(nitrate, aes(x=DEP.2)) + 
  geom_histogram(colour = "brown", fill = "white") +
  theme_bw()
Dln <- ggplot(nitrate, aes(x=DEP.ln)) + 
  geom_histogram(colour = "brown", fill = "white") +
  theme_bw()
D3 <- ggplot(nitrate, aes(x=DEP.3)) + 
  geom_histogram(colour = "brown", fill = "white") +
  theme_bw()
grid.arrange(D, D2, Dln, D3, ncol=4)

Scatterplot and Correlation Matrices

theme_set(theme_minimal(20))
library(GGally)
set.seed(1836)
subset <- nitrate[,c(12,17,19,22,25,27,31,33)]
ggpairs(subset, diag=list(continuous="density"), axisLabels='none')

frame <- data.frame(nitrate[,c(12,17,19,22,25,27,31,33)])
cor(frame)
##             DISCHARG.ln PREC.2 PREC.3 AREA.3 DENSITY.3 NO3.ln DEP.3
## DISCHARG.ln        1.00  0.285  0.276  0.798    -0.327  -0.38 -0.26
## PREC.2             0.28  1.000  0.998 -0.019    -0.044  -0.14  0.15
## PREC.3             0.28  0.998  1.000 -0.043    -0.025  -0.11  0.18
## AREA.3             0.80 -0.019 -0.043  1.000    -0.416  -0.43 -0.47
## DENSITY.3         -0.33 -0.044 -0.025 -0.416     1.000   0.90  0.68
## NO3.ln            -0.38 -0.136 -0.112 -0.429     0.899   1.00  0.67
## DEP.3             -0.26  0.148  0.180 -0.472     0.678   0.67  1.00
## NPREC.ln          -0.36 -0.381 -0.354 -0.376     0.650   0.69  0.85
##             NPREC.ln
## DISCHARG.ln    -0.36
## PREC.2         -0.38
## PREC.3         -0.35
## AREA.3         -0.38
## DENSITY.3       0.65
## NO3.ln          0.69
## DEP.3           0.85
## NPREC.ln        1.00
lm1 <- lm(NO3.ln~DISCHARG.ln+RUNOFF.3+PREC.2+AREA.3+DENSITY.3+NPREC.ln+DEP.3, data=nitrate)
stargazer(lm1,type="text",title="Regression Outputs", ci=TRUE, ci.level=0.95,single.row=TRUE)
## 
## Regression Outputs
## ===============================================
##                         Dependent variable:    
##                     ---------------------------
##                               NO3.ln           
## -----------------------------------------------
## DISCHARG.ln            0.010 (-0.170, 0.190)   
## RUNOFF.3              -0.440* (-0.920, 0.038)  
## PREC.2                -0.093 (-0.380, 0.190)   
## AREA.3                -0.004 (-0.014, 0.006)   
## DENSITY.3             0.550*** (0.420, 0.670)  
## NPREC.ln              -0.450 (-1.800, 0.870)   
## DEP.3                  0.250 (-0.310, 0.820)   
## Constant              2.800** (0.440, 5.100)   
## -----------------------------------------------
## Observations                    42             
## R2                             0.850           
## Adjusted R2                    0.810           
## Residual Std. Error       0.590 (df = 34)      
## F Statistic           27.000*** (df = 7; 34)   
## ===============================================
## Note:               *p<0.1; **p<0.05; ***p<0.01

Plots of Response Variable against Each Predictor Variable

p1 <- ggplot(nitrate, aes(AREA.3, NO3.ln)) +
  geom_point() +
  stat_smooth(method="loess")+
  theme_bw()+
  theme(axis.title.x=element_text(size=9.5)) +
  theme(axis.title.y=element_text(size=9.5))
p2 <- ggplot(nitrate, aes(DISCHARG.ln, NO3.ln)) +
  geom_point()  +
  stat_smooth(method="loess")+
  theme_bw()+
  theme(axis.title.x=element_text(size=9.5)) +
  theme(axis.title.y=element_text(size=9.5))
p3 <- ggplot(nitrate, aes(PREC.2, NO3.ln)) +
  geom_point() +
  stat_smooth(method="loess")+
  theme_bw()+
  theme(axis.title.x=element_text(size=9.5)) +
  theme(axis.title.y=element_text(size=9.5))
p4 <- ggplot(nitrate, aes(RUNOFF.3, NO3.ln)) +
  geom_point() +
  stat_smooth(method="loess")+
  theme_bw()+
  theme(axis.title.x=element_text(size=9.5)) +
  theme(axis.title.y=element_text(size=9.5))
p5 <- ggplot(nitrate, aes(NPREC.ln, NO3.ln)) +
  geom_point() +
  stat_smooth(method="loess")+
  theme_bw()+
  theme(axis.title.x=element_text(size=9.5)) +
  theme(axis.title.y=element_text(size=9.5))
p6 <- ggplot(nitrate, aes(DEP.ln, NO3.ln)) +
  geom_point() +
  stat_smooth(method="loess")+
  theme_bw()+
  theme(axis.title.x=element_text(size=9.5)) +
  theme(axis.title.y=element_text(size=9.5))
p7 <- ggplot(nitrate, aes(DENSITY.3, NO3.ln)) +
  geom_point() +
  stat_smooth(method="loess")+
  theme_bw()+
  theme(axis.title.x=element_text(size=9.5)) +
  theme(axis.title.y=element_text(size=9.5))
grid.arrange(p1, p2, p3, p4, p5, p6, p7, ncol=4, nrow=2)

Plots of Residuals

resid <- resid(lm1)
r1 <- ggplot(nitrate, aes(AREA.3, resid)) +
  geom_point() +
  geom_hline(yintercept=0, col="red", linetype="dashed") +
  theme_bw()+
  theme(axis.title.x=element_text(size=9.5)) +
  theme(axis.title.y=element_text(size=9.5))
r2 <- ggplot(nitrate, aes(DISCHARG.ln, resid)) +
  geom_point()  +
  geom_hline(yintercept=0, col="red", linetype="dashed") +
  theme_bw()+
  theme(axis.title.x=element_text(size=9.5)) +
  theme(axis.title.y=element_text(size=9.5))
r3 <- ggplot(nitrate, aes(PREC.2, resid)) +
  geom_point() +
  geom_hline(yintercept=0, col="red", linetype="dashed") +
  theme_bw()+
  theme(axis.title.x=element_text(size=9.5)) +
  theme(axis.title.y=element_text(size=9.5))
r4 <- ggplot(nitrate, aes(RUNOFF.3, resid)) +
  geom_point() +
  geom_hline(yintercept=0, col="red", linetype="dashed") +
  theme_bw()+
  theme(axis.title.x=element_text(size=9.5)) +
  theme(axis.title.y=element_text(size=9.5))
r5 <- ggplot(nitrate, aes(NPREC.ln, resid)) +
  geom_point() +
  geom_hline(yintercept=0, col="red", linetype="dashed") +
  theme_bw()+
  theme(axis.title.x=element_text(size=9.5)) +
  theme(axis.title.y=element_text(size=9.5))
r6 <- ggplot(nitrate, aes(DEP.ln, resid)) +
  geom_point() +
  geom_hline(yintercept=0, col="red", linetype="dashed") +
  theme_bw()+
  theme(axis.title.x=element_text(size=9.5)) +
  theme(axis.title.y=element_text(size=9.5))
r7 <- ggplot(nitrate, aes(DENSITY.3, resid)) +
  geom_point() +
  geom_hline(yintercept=0, col="red", linetype="dashed") +
  theme_bw()+
  theme(axis.title.x=element_text(size=9.5)) +
  theme(axis.title.y=element_text(size=9.5))
grid.arrange(r1, r2, r3, r4, r5, r6, r7, ncol=4, nrow=2)

All Subset Regression