#load the required packages
library(alr4)
## Loading required package: car
## Loading required package: carData
## Loading required package: effects
## lattice theme set by effectsTheme()
## See ?effectsTheme for details.
library(ggplot2)
# Bring the UN11 data frame into the workspace and preview its first rows
data("UN11")
head(UN11, n = 6L)
## region group fertility ppgdp lifeExpF pctUrban
## Afghanistan Asia other 5.968 499.0 49.49 23
## Albania Europe other 1.525 3677.2 80.40 53
## Algeria Africa africa 2.142 4473.0 75.00 67
## Angola Africa africa 5.135 4321.9 53.17 59
## Anguilla Caribbean other 2.000 13750.1 81.10 100
## Argentina Latin Amer other 2.172 9162.1 79.89 93
#Solution
#The predictor variable is ppgdp while the response variable is fertility.
# Scatterplot of fertility against ppgdp with two fitted curves and no
# confidence bands: a least-squares line (geom_smooth's default colour) and
# a loess smooth. The loess curve is drawn in red so that it cannot be
# mistaken for the least-squares line, which ggplot2 draws in blue by
# default.
ggplot(UN11, aes(x = ppgdp, y = fertility)) +
  geom_point() +
  geom_smooth(method = lm, se = FALSE) +
  geom_smooth(method = loess, se = FALSE, color = "red") +
  xlab("ppgdp") +
  ylab(" fertility ") +
  ggtitle("Fertility versus ppgdp")
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'
##In this graph, simple linear regression is not a good summary. The variance does not seem to be constant, and the mean function does not seem to be linear.
# Attach the packages and load a fresh copy of the data for this part.
# The workspace is intentionally not wiped with rm(list = ls()): doing so
# would delete every object of whoever runs this file. library() is used
# instead of require() so that a missing package stops with an error
# rather than returning FALSE and failing later.
library(alr4) # UN11 is in this package
library(ggplot2) # For plotting
data(UN11) # Get data
head(UN11) # Verify data
## region group fertility ppgdp lifeExpF pctUrban
## Afghanistan Asia other 5.968 499.0 49.49 23
## Albania Europe other 1.525 3677.2 80.40 53
## Algeria Africa africa 2.142 4473.0 75.00 67
## Angola Africa africa 5.135 4321.9 53.17 59
## Anguilla Caribbean other 2.000 13750.1 81.10 100
## Argentina Latin Amer other 2.172 9162.1 79.89 93
#Solution
#In the graphic below, natural logarithms (R's log()) are used. Using
#base-ten logarithms instead would only change the labels for the tick
#marks, not the shape of the graph.
# Transformations start here. The natural logarithms of ppgdp and
# fertility are added to the UN11 data frame as new columns.
UN11$logppgdp <- log(UN11$ppgdp)
UN11$logfertility <- log(UN11$fertility)
# Scatterplot on the log-log scale with a least-squares line and a red
# loess curve, both without confidence bands. The title names the
# transformed variables so the plot matches its axes.
ggplot(UN11, aes(x = logppgdp, y = logfertility)) +
  geom_point() +
  geom_smooth(method = lm, se = FALSE) +
  geom_smooth(method = loess, se = FALSE, color = "red") +
  xlab("logppgdp") +
  ylab("logfertility") +
  ggtitle("log(fertility) versus log(ppgdp)")
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'
##Since the mean function seems to be linear with a relatively
##constant variance, simple linear regression is much more appropriate in
##this situation. The probable exception is the region where log(ppgdp)
##is very small, where log(fertility) is typically larger than what
##would be predicted by a straight-line regression.
# d)
head(UN11, n = 6L) # Check the first rows, including the two log columns.
## region group fertility ppgdp lifeExpF pctUrban logppgdp
## Afghanistan Asia other 5.968 499.0 49.49 23 6.212606
## Albania Europe other 1.525 3677.2 80.40 53 8.209907
## Algeria Africa africa 2.142 4473.0 75.00 67 8.405815
## Angola Africa africa 5.135 4321.9 53.17 59 8.371450
## Anguilla Caribbean other 2.000 13750.1 81.10 100 9.528801
## Argentina Latin Amer other 2.172 9162.1 79.89 93 9.122831
## logfertility
## Afghanistan 1.7864119
## Albania 0.4219944
## Algeria 0.7617400
## Angola 1.6360798
## Anguilla 0.6931472
## Argentina 0.7756484
# Least-squares fit of fertility on ppgdp (untransformed variables),
# followed by its coefficient table and fit statistics
model1 <- lm(formula = fertility ~ ppgdp, data = UN11)
summary(object = model1)
##
## Call:
## lm(formula = fertility ~ ppgdp, data = UN11)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.9006 -0.8801 -0.3547 0.6749 3.7585
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 3.178e+00 1.048e-01 30.331 < 2e-16 ***
## ppgdp -3.201e-05 4.655e-06 -6.877 7.9e-11 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.206 on 197 degrees of freedom
## Multiple R-squared: 0.1936, Adjusted R-squared: 0.1895
## F-statistic: 47.29 on 1 and 197 DF, p-value: 7.903e-11
#Solution
#The value of R^2 is 0.1936, while the value of adjusted R^2 is 0.1895.
#This means that only about 19% of the total variation in fertility is
#explained by the linear regression on ppgdp. This is a very poor fit.
# Analysis-of-variance table for the fitted model1
anova(object = model1)
## Analysis of Variance Table
##
## Response: fertility
## Df Sum Sq Mean Sq F value Pr(>F)
## ppgdp 1 68.785 68.785 47.293 7.903e-11 ***
## Residuals 197 286.526 1.454
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#Solution
#From the test results, the null hypothesis of no linear relationship
#between fertility and ppgdp is rejected: the regression is significant,
#since the p-value 7.903e-11 is less than the 0.05 level of significance.
#The predict function is now used, and the predictor variable is set in the newdata argument. Additionally, we choose the default 0.95 confidence level and set the interval type to “predict”.
# Reload UN11 and recompute the natural-log columns. The logs are stored as
# plain numeric vectors: wrapping them in data.frame() would nest a whole
# data frame inside a single column of UN11.
data("UN11")
UN11$logppgdp <- log(UN11$ppgdp)
UN11$logfertility <- log(UN11$fertility)
max(UN11$logfertility)
## [1] 1.935138
min(UN11$logfertility)
## [1] 0.1257512
# The row names of UN11 are the localities, so indexing them by the
# position of each extreme names the locality that attains it.
rownames(UN11)[which.max(UN11$logfertility)]
rownames(UN11)[which.min(UN11$logfertility)]
#Solution
# The locality with the highest fertility has a log-fertility of
# 1.935138, and the locality with the lowest fertility has a
# log-fertility of 0.1257512.