DATA 621 - Homework #1 Linear Models with R, 1.1, 1.3, 1.4, 1.5

  1. The dataset teengamb concerns a study of teenage gambling in Britain. Make a numerical and graphical summary of the data,commenting on any features that you find intresing. Limit the output you present to a quantity that a busy reader would find sufficient to get a basic understanding of the data.
library(faraway)
## Warning: package 'faraway' was built under R version 3.4.3
data("teengamb")
head(teengamb)
##   sex status income verbal gamble
## 1   1     51   2.00      8    0.0
## 2   1     28   2.50      8    0.0
## 3   1     37   2.00      6    0.0
## 4   1     28   7.00      4    7.3
## 5   1     65   2.00      8   19.6
## 6   1     61   3.47      6    0.1
tail(teengamb)
##    sex status income verbal gamble
## 42   0     61  15.00      9   69.7
## 43   0     75   3.00      8   13.3
## 44   0     66   3.25      9    0.6
## 45   0     62   4.94      6   38.0
## 46   0     71   1.50      7   14.4
## 47   0     71   2.50      9   19.2
summary(teengamb)
##       sex             status          income           verbal     
##  Min.   :0.0000   Min.   :18.00   Min.   : 0.600   Min.   : 1.00  
##  1st Qu.:0.0000   1st Qu.:28.00   1st Qu.: 2.000   1st Qu.: 6.00  
##  Median :0.0000   Median :43.00   Median : 3.250   Median : 7.00  
##  Mean   :0.4043   Mean   :45.23   Mean   : 4.642   Mean   : 6.66  
##  3rd Qu.:1.0000   3rd Qu.:61.50   3rd Qu.: 6.210   3rd Qu.: 8.00  
##  Max.   :1.0000   Max.   :75.00   Max.   :15.000   Max.   :10.00  
##      gamble     
##  Min.   :  0.0  
##  1st Qu.:  1.1  
##  Median :  6.0  
##  Mean   : 19.3  
##  3rd Qu.: 19.4  
##  Max.   :156.0

Looking at the data, nothing odd jumps out at the immediately. I did notice that one gender is 0, while the other is 1. I’m going to go ahead and sort each of the catagories to see if I can find any interesting things.

sort(teengamb$status)
##  [1] 18 18 18 18 27 27 28 28 28 28 28 28 28 30 30 37 38 38 38 38 38 43 43
## [24] 43 43 47 48 51 51 51 51 51 61 61 61 62 62 62 65 65 66 71 71 71 71 71
## [47] 75
sort(teengamb$income)
##  [1]  0.60  1.00  1.50  1.50  1.50  1.50  1.60  2.00  2.00  2.00  2.00
## [12]  2.00  2.00  2.00  2.20  2.50  2.50  2.50  2.50  3.00  3.00  3.00
## [23]  3.00  3.25  3.47  3.50  3.50  4.00  4.50  4.75  4.94  5.44  5.50
## [34]  5.50  6.00  6.42  6.50  7.00  7.00  8.00  9.50 10.00 10.00 10.00
## [45] 12.00 15.00 15.00
sort(teengamb$verbal)
##  [1]  1  2  4  4  4  4  5  5  5  6  6  6  6  6  6  6  6  6  6  6  6  7  7
## [24]  7  7  7  7  7  7  7  8  8  8  8  8  8  8  8  8  8  9  9  9  9  9  9
## [47] 10
sort(teengamb$gamble)
##  [1]   0.00   0.00   0.00   0.00   0.10   0.10   0.10   0.10   0.10   0.60
## [11]   0.60   1.00   1.20   1.20   1.45   1.70   2.10   2.40   3.00   3.00
## [21]   3.40   3.60   5.40   6.00   6.60   6.90   7.30   8.40   9.60  12.00
## [31]  13.30  14.10  14.40  14.50  19.20  19.60  25.00  38.00  38.50  38.50
## [41]  53.20  57.20  69.70  70.00  88.00  90.00 156.00

The sorted data did not bring much to light in terms of missing data or outliers. I would like to see how many women and men we have.

teengamb$sex <- factor(teengamb$sex)
summary(teengamb$sex)
##  0  1 
## 28 19
levels(teengamb$sex) <- c("male", "female") 
summary(teengamb)
##      sex         status          income           verbal     
##  male  :28   Min.   :18.00   Min.   : 0.600   Min.   : 1.00  
##  female:19   1st Qu.:28.00   1st Qu.: 2.000   1st Qu.: 6.00  
##              Median :43.00   Median : 3.250   Median : 7.00  
##              Mean   :45.23   Mean   : 4.642   Mean   : 6.66  
##              3rd Qu.:61.50   3rd Qu.: 6.210   3rd Qu.: 8.00  
##              Max.   :75.00   Max.   :15.000   Max.   :10.00  
##      gamble     
##  Min.   :  0.0  
##  1st Qu.:  1.1  
##  Median :  6.0  
##  Mean   : 19.3  
##  3rd Qu.: 19.4  
##  Max.   :156.0

We will now complete a few different plots to see if we notice anything cool. A few things I’d like to plot are verbal (which I believe is an indication of education) vs. gamble to see if there’s a correlation between education and gambling. I would also like to see income vs. gamble and lastly I’d want to see socioeconomic status vs gamble. I am working under the assumption that our outcome is “gamble”, while our predictors are “sex, status, income, and verbal”.

hist(teengamb$gamble, xlab = "GAMBLING EXPENDITURE", main = "") #the look of this histogram is a bit worrisome to me, as it is a bit skewed and I wonder is a linear regression is the right move.

plot(density(teengamb$gamble, na.rm = TRUE), main = "")

plot(sort(teengamb$gamble), ylab = "Sorted Expenditure")

When we look at the density plot, we notice that there’s a break in our data, indicating that our response is not continuous I’ll go back and edit the data. It looks like it may be due to the max being 156. I will remove this datapoint and re-evaluate.

teengamb$gamble[teengamb$gamble > 90] <- NA
plot(gamble ~ sex, teengamb)

hist(teengamb$gamble, xlab = "GAMBLING EXPENDITURE", main = "")

plot(density(teengamb$gamble, na.rm = TRUE), main = "") #note that the gap is not longer in our density plot

plot(sort(teengamb$gamble), ylab = "Sorted Expenditure")

Now let’s go back to plot the previously stated and see if we can get some information from this.

plot(teengamb$gamble ~ teengamb$verbal)

plot(teengamb$gamble ~ teengamb$status)

plot(teengamb$gamble ~ teengamb$income)

Now that we have looked at the different plots, we will look at our data numerically.

plot(gamble ~ verbal, teengamb)
abline(lm(gamble ~ verbal, teengamb))

lmod <- lm(gamble ~ verbal, teengamb)
coef(lmod)
## (Intercept)      verbal 
##   26.131518   -1.459223
plot(gamble ~ status, teengamb)
abline(lm(gamble ~ status, teengamb))

lmod2 <- lm(gamble ~ status, teengamb)
coef(lmod2)
## (Intercept)      status 
## 11.94369928  0.09611236
plot(gamble ~ income, teengamb)
abline(lm(gamble ~ income, teengamb))

lmod3 <- lm(gamble ~ income, teengamb)
coef(lmod3)
## (Intercept)      income 
##   -3.835302    4.455848

Below are our values from above. + Verbal: intercept = 26.13, verbal = -1.46 + Status: intercept = 11.94, status = .096 + Income: intercept = -3.84, income = 4.46

From our calculations above, we can see that most of the points do not fall on the best fit line. This leads me to believe that my intial thoughts about this dataset were correct. Due to the skewness of the inital histogram, it looks like a linear regression was not the way to go.

  1. The dataset prostate is from a study on 97 men with prostate cancer who were due to recieve a radical prostatectomy. Make a numerical and graphical summary of the data as in the first question.
data(prostate)
head(prostate)
##       lcavol lweight age      lbph svi      lcp gleason pgg45     lpsa
## 1 -0.5798185  2.7695  50 -1.386294   0 -1.38629       6     0 -0.43078
## 2 -0.9942523  3.3196  58 -1.386294   0 -1.38629       6     0 -0.16252
## 3 -0.5108256  2.6912  74 -1.386294   0 -1.38629       7    20 -0.16252
## 4 -1.2039728  3.2828  58 -1.386294   0 -1.38629       6     0 -0.16252
## 5  0.7514161  3.4324  62 -1.386294   0 -1.38629       6     0  0.37156
## 6 -1.0498221  3.2288  50 -1.386294   0 -1.38629       6     0  0.76547
tail(prostate)
##      lcavol lweight age      lbph svi      lcp gleason pgg45    lpsa
## 92 2.532903  3.6776  61  1.348073   1 -1.38629       7    15 4.12955
## 93 2.830268  3.8764  68 -1.386294   1  1.32176       7    60 4.38515
## 94 3.821004  3.8969  44 -1.386294   1  2.16905       7    40 4.68444
## 95 2.907447  3.3962  52 -1.386294   1  2.46385       7    10 5.14312
## 96 2.882564  3.7739  68  1.558145   1  1.55814       7    80 5.47751
## 97 3.471967  3.9750  68  0.438255   1  2.90417       7    20 5.58293
summary(prostate)
##      lcavol           lweight           age             lbph        
##  Min.   :-1.3471   Min.   :2.375   Min.   :41.00   Min.   :-1.3863  
##  1st Qu.: 0.5128   1st Qu.:3.376   1st Qu.:60.00   1st Qu.:-1.3863  
##  Median : 1.4469   Median :3.623   Median :65.00   Median : 0.3001  
##  Mean   : 1.3500   Mean   :3.653   Mean   :63.87   Mean   : 0.1004  
##  3rd Qu.: 2.1270   3rd Qu.:3.878   3rd Qu.:68.00   3rd Qu.: 1.5581  
##  Max.   : 3.8210   Max.   :6.108   Max.   :79.00   Max.   : 2.3263  
##       svi              lcp             gleason          pgg45       
##  Min.   :0.0000   Min.   :-1.3863   Min.   :6.000   Min.   :  0.00  
##  1st Qu.:0.0000   1st Qu.:-1.3863   1st Qu.:6.000   1st Qu.:  0.00  
##  Median :0.0000   Median :-0.7985   Median :7.000   Median : 15.00  
##  Mean   :0.2165   Mean   :-0.1794   Mean   :6.753   Mean   : 24.38  
##  3rd Qu.:0.0000   3rd Qu.: 1.1786   3rd Qu.:7.000   3rd Qu.: 40.00  
##  Max.   :1.0000   Max.   : 2.9042   Max.   :9.000   Max.   :100.00  
##       lpsa        
##  Min.   :-0.4308  
##  1st Qu.: 1.7317  
##  Median : 2.5915  
##  Mean   : 2.4784  
##  3rd Qu.: 3.0564  
##  Max.   : 5.5829

For this question, I’m going to go ahead and skip straight to plotting and explain what I see with the plots.

prostate$gleason <- factor(prostate$gleason)
prostate$svi <- factor(prostate$svi)
summary(prostate)
##      lcavol           lweight           age             lbph        
##  Min.   :-1.3471   Min.   :2.375   Min.   :41.00   Min.   :-1.3863  
##  1st Qu.: 0.5128   1st Qu.:3.376   1st Qu.:60.00   1st Qu.:-1.3863  
##  Median : 1.4469   Median :3.623   Median :65.00   Median : 0.3001  
##  Mean   : 1.3500   Mean   :3.653   Mean   :63.87   Mean   : 0.1004  
##  3rd Qu.: 2.1270   3rd Qu.:3.878   3rd Qu.:68.00   3rd Qu.: 1.5581  
##  Max.   : 3.8210   Max.   :6.108   Max.   :79.00   Max.   : 2.3263  
##  svi         lcp          gleason     pgg45             lpsa        
##  0:76   Min.   :-1.3863   6:35    Min.   :  0.00   Min.   :-0.4308  
##  1:21   1st Qu.:-1.3863   7:56    1st Qu.:  0.00   1st Qu.: 1.7317  
##         Median :-0.7985   8: 1    Median : 15.00   Median : 2.5915  
##         Mean   :-0.1794   9: 5    Mean   : 24.38   Mean   : 2.4784  
##         3rd Qu.: 1.1786           3rd Qu.: 40.00   3rd Qu.: 3.0564  
##         Max.   : 2.9042           Max.   :100.00   Max.   : 5.5829
#this dataset did not let me know what each of the categories are.

I would like to take a look at the histogram to see if this set is more viable that the one from number 1

hist(prostate$lcavol, xlab = "Cancer Volume", main = "")

plot(density(prostate$lcavol, na.rm = TRUE), main = "")

plot(sort(prostate$lcavol), ylab = "Sorted Cancer Volume")

Our histogram is, again a little funny looking and our density plot is also skew, although not quite as skewwed as the one from problem #1.

plot(lcavol ~ lweight, prostate)
abline(lm(lcavol ~ lweight, prostate))

lmod4 <- lm(lcavol ~ lweight, prostate)
coef(lmod4)
## (Intercept)     lweight 
##  -0.3328410   0.4607156
plot(lcavol ~ age, prostate)
abline(lm(lcavol ~ age, prostate))

lmod5 <- lm(lcavol ~ age, prostate)
coef(lmod5)
## (Intercept)         age 
## -0.92485702  0.03561938
plot(lcavol ~ lcp, prostate)
abline(lm(lcavol ~ lcp, prostate))

lmod6 <- lm(lcavol ~ lcp, prostate)
coef(lmod6)
## (Intercept)         lcp 
##   1.4521105   0.5692395
plot(lcavol ~ lbph, prostate)
abline(lm(lcavol ~ lbph, prostate))

lmod7 <- lm(lcavol ~ lbph, prostate)
coef(lmod7)
## (Intercept)        lbph 
##  1.34777980  0.02221871
plot(lcavol ~ lpsa, prostate)
abline(lm(lcavol ~ lpsa, prostate))

lmod8 <- lm(lcavol ~ lpsa, prostate)
coef(lmod8)
## (Intercept)        lpsa 
##  -0.5085802   0.7499191

From the plots we can see that lpsa and lcp have the most data points that are along our best fit line.

  1. The dataset sat comes from a study entitled “Getting What You Pay For: The Debate Over Equity in Public School Expenditures.” Make a numerical and graphical summary of the data as in the first question.
data(sat)
head(sat)
##            expend ratio salary takers verbal math total
## Alabama     4.405  17.2 31.144      8    491  538  1029
## Alaska      8.963  17.6 47.951     47    445  489   934
## Arizona     4.778  19.3 32.175     27    448  496   944
## Arkansas    4.459  17.1 28.934      6    482  523  1005
## California  4.992  24.0 41.078     45    417  485   902
## Colorado    5.443  18.4 34.571     29    462  518   980
tail(sat)
##               expend ratio salary takers verbal math total
## Vermont        6.750  13.8 35.406     68    429  472   901
## Virginia       5.327  14.6 33.987     65    428  468   896
## Washington     5.906  20.2 36.151     48    443  494   937
## West Virginia  6.107  14.8 31.944     17    448  484   932
## Wisconsin      6.930  15.9 37.746      9    501  572  1073
## Wyoming        6.160  14.9 31.285     10    476  525  1001
summary(sat)
##      expend          ratio           salary          takers     
##  Min.   :3.656   Min.   :13.80   Min.   :25.99   Min.   : 4.00  
##  1st Qu.:4.882   1st Qu.:15.22   1st Qu.:30.98   1st Qu.: 9.00  
##  Median :5.768   Median :16.60   Median :33.29   Median :28.00  
##  Mean   :5.905   Mean   :16.86   Mean   :34.83   Mean   :35.24  
##  3rd Qu.:6.434   3rd Qu.:17.57   3rd Qu.:38.55   3rd Qu.:63.00  
##  Max.   :9.774   Max.   :24.30   Max.   :50.05   Max.   :81.00  
##      verbal           math           total       
##  Min.   :401.0   Min.   :443.0   Min.   : 844.0  
##  1st Qu.:427.2   1st Qu.:474.8   1st Qu.: 897.2  
##  Median :448.0   Median :497.5   Median : 945.5  
##  Mean   :457.1   Mean   :508.8   Mean   : 965.9  
##  3rd Qu.:490.2   3rd Qu.:539.5   3rd Qu.:1032.0  
##  Max.   :516.0   Max.   :592.0   Max.   :1107.0
hist(sat$expend, xlab = "Expenditure", main = "")

plot(density(sat$expend, na.rm = TRUE), main = "")

plot(sort(sat$expend), ylab = "Sorted Expenditure")

plot(expend ~ ratio, sat)
abline(lm(expend ~ ratio, sat))

lmod9 <- lm(expend ~ ratio, sat)
coef(lmod9)
## (Intercept)       ratio 
##   9.6663699  -0.2231053
plot(expend ~ salary, sat)
abline(lm(expend ~ salary, sat))

lmod10 <- lm(expend ~ salary, sat)
coef(lmod10)
## (Intercept)      salary 
##  -1.0436300   0.1995149
plot(expend ~ takers, sat)
abline(lm(expend ~ takers, sat))

lmod11 <- lm(expend ~ takers, sat)
coef(lmod11)
## (Intercept)      takers 
##  4.84178693  0.03017801
plot(expend ~ verbal, sat)
abline(lm(expend ~ verbal, sat))

lmod12 <- lm(expend ~ verbal, sat)
coef(lmod12)
## (Intercept)      verbal 
## 13.16756189 -0.01588638
plot(expend ~ math, sat)
abline(lm(expend ~ math, sat))

lmod13 <- lm(expend ~ math, sat)
coef(lmod13)
## (Intercept)        math 
## 11.93123808 -0.01184398
plot(expend ~ total, sat)
abline(lm(expend ~ total, sat))

lmod14 <- lm(expend ~ total, sat)
coef(lmod14)
##  (Intercept)        total 
## 12.600271355 -0.006931228
  1. The dataset divusa contains data on divorces in the United States from 1920 to 1996. Make a numerical and graphical summary of the data as in the first question.
data(divusa)
head(divusa)
##   year divorce unemployed femlab marriage birth military
## 1 1920     8.0        5.2  22.70     92.0 117.9   3.2247
## 2 1921     7.2       11.7  22.79     83.0 119.8   3.5614
## 3 1922     6.6        6.7  22.88     79.7 111.2   2.4553
## 4 1923     7.1        2.4  22.97     85.2 110.5   2.2065
## 5 1924     7.2        5.0  23.06     80.3 110.9   2.2889
## 6 1925     7.2        3.2  23.15     79.2 106.6   2.1735
tail(divusa)
##    year divorce unemployed femlab marriage birth military
## 72 1991    20.9        6.8   57.4     54.2  69.6   7.8744
## 73 1992    21.2        7.5   57.8     53.3  68.9   7.0862
## 74 1993    20.5        6.9   57.9     52.3  67.6   6.6145
## 75 1994    20.5        6.1   58.8     51.5  66.7   6.1865
## 76 1995    19.8        5.6   58.9     50.8  65.6   5.7770
## 77 1996    19.5        5.4   59.3     49.7  65.3   5.5488
summary(divusa)
##       year         divorce        unemployed         femlab     
##  Min.   :1920   Min.   : 6.10   Min.   : 1.200   Min.   :22.70  
##  1st Qu.:1939   1st Qu.: 8.70   1st Qu.: 4.200   1st Qu.:27.47  
##  Median :1958   Median :10.60   Median : 5.600   Median :37.10  
##  Mean   :1958   Mean   :13.27   Mean   : 7.173   Mean   :38.58  
##  3rd Qu.:1977   3rd Qu.:20.30   3rd Qu.: 7.500   3rd Qu.:47.80  
##  Max.   :1996   Max.   :22.80   Max.   :24.900   Max.   :59.30  
##     marriage          birth           military     
##  Min.   : 49.70   Min.   : 65.30   Min.   : 1.940  
##  1st Qu.: 61.90   1st Qu.: 68.90   1st Qu.: 3.469  
##  Median : 74.10   Median : 85.90   Median : 9.102  
##  Mean   : 72.97   Mean   : 88.89   Mean   :12.365  
##  3rd Qu.: 80.00   3rd Qu.:107.30   3rd Qu.:14.266  
##  Max.   :118.10   Max.   :122.90   Max.   :86.641
hist(divusa$divorce, xlab = "DIVORCE", main = "")

plot(density(divusa$divorce, na.rm = TRUE), main = "")

plot(sort(divusa$divorce), ylab = "Sorted Divorce")

Just looking at the plots above, I would not perform a linear regression on this dataset.

plot(divorce ~ unemployed, divusa)
abline(lm(divorce ~ unemployed, divusa))

lmod9 <- lm(divorce ~ unemployed, divusa)
coef(lmod9)
## (Intercept)  unemployed 
##   14.954236   -0.234974
plot(divorce ~ marriage, divusa)
abline(lm(divorce ~ marriage, divusa))

lmod10 <- lm(divorce ~ marriage, divusa)
coef(lmod10)
## (Intercept)    marriage 
##  30.1081994  -0.2307625
plot(divorce ~ femlab, divusa)
abline(lm(divorce ~ femlab, divusa))

lmod11 <- lm(divorce ~ femlab, divusa)
coef(lmod11)
## (Intercept)      femlab 
##  -3.6552722   0.4386697
plot(divorce ~ birth, divusa)
abline(lm(divorce ~ birth, divusa))

lmod12 <- lm(divorce ~ birth, divusa)
coef(lmod12)
## (Intercept)       birth 
##   31.905774   -0.209667
plot(divorce ~ military, divusa)
abline(lm(divorce ~ military, divusa))

lmod13 <- lm(divorce ~ military, divusa)
coef(lmod13)
##  (Intercept)     military 
## 13.181175227  0.007089168