#load the required packages

library(alr4)

## Loading required package: car

## Loading required package: carData

## Loading required package: effects

## lattice theme set by effectsTheme()
## See ?effectsTheme for details.

library(ggplot2)

#load the data set

# Bring the UN11 data set into the workspace and show its first six rows
data("UN11")
head(UN11, n = 6L)

##                 region  group fertility   ppgdp lifeExpF pctUrban
## Afghanistan       Asia  other     5.968   499.0    49.49       23
## Albania         Europe  other     1.525  3677.2    80.40       53
## Algeria         Africa africa     2.142  4473.0    75.00       67
## Angola          Africa africa     5.135  4321.9    53.17       59
## Anguilla     Caribbean  other     2.000 13750.1    81.10      100
## Argentina   Latin Amer  other     2.172  9162.1    79.89       93

a) Identify the predictor and the response.

#Solution

#The predictor variable is ppgdp while the response variable is fertility.

b)

# Scatterplot of fertility against ppgdp on the original scale, overlaid with
# the least-squares line (ggplot2's default smoother colour) and a loess
# smooth. The loess curve is drawn in red so the two fits can be told apart;
# this matches the colouring used for the log-scale plot in part (c).
ggplot(UN11, aes(x = ppgdp, y = fertility)) +
  geom_point() +
  geom_smooth(method = "lm", se = FALSE) +
  geom_smooth(method = "loess", se = FALSE, color = "red") +
  xlab("ppgdp") +
  ylab("fertility") +
  ggtitle("Fertility versus ppgdp")

## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'

##In this graph, simple linear regression is not a good summary. The variance does not seem to be constant, and the mean function does not seem to be linear.

c)

# Reload the packages and data needed for part (c).
# The environment is deliberately not wiped with rm(list = ls()): doing so
# would silently delete any objects created earlier in the session.
# library() is used rather than require() so a missing package stops the
# script with an error instead of returning FALSE and failing later.
library(alr4)     # UN11 is in this package
library(ggplot2)  # For plotting
data(UN11)        # Get data
head(UN11)        # Verify data

##                 region  group fertility   ppgdp lifeExpF pctUrban
## Afghanistan       Asia  other     5.968   499.0    49.49       23
## Albania         Europe  other     1.525  3677.2    80.40       53
## Algeria         Africa africa     2.142  4473.0    75.00       67
## Angola          Africa africa     5.135  4321.9    53.17       59
## Anguilla     Caribbean  other     2.000 13750.1    81.10      100
## Argentina   Latin Amer  other     2.172  9162.1    79.89       93

#Solution
#In the graphic below, natural logarithms (the default of log()) are used. Using base-ten logarithms instead would only change the labels on the tick marks, not the shape of the graph.

# Log transformations (natural log, the default base of log()).
# The transformed values are stored as new columns of the UN11 data frame.

UN11$logppgdp <- log(UN11$ppgdp)
UN11$logfertility <- log(UN11$fertility)

# Scatterplot of log(fertility) against log(ppgdp), with the least-squares
# line (default smoother colour) and a loess smooth (red) for comparison.
# The title names the logged variables, since both axes are on the log scale.
ggplot(UN11, aes(x = logppgdp, y = logfertility)) +
  geom_point() +
  geom_smooth(method = "lm", se = FALSE) +
  geom_smooth(method = "loess", se = FALSE, color = "red") +
  xlab("logppgdp") +
  ylab("logfertility") +
  ggtitle("log(fertility) versus log(ppgdp)")

## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'

##Since the mean function now appears to be linear with roughly constant variance, simple linear regression is much more appropriate on the log scale. The possible exception is the localities with very small log(ppgdp), where log(fertility) tends to be larger than the straight-line fit would predict.
# d)

head(UN11, n = 6L)  # Check that the two log columns are present

##                 region  group fertility   ppgdp lifeExpF pctUrban logppgdp
## Afghanistan       Asia  other     5.968   499.0    49.49       23 6.212606
## Albania         Europe  other     1.525  3677.2    80.40       53 8.209907
## Algeria         Africa africa     2.142  4473.0    75.00       67 8.405815
## Angola          Africa africa     5.135  4321.9    53.17       59 8.371450
## Anguilla     Caribbean  other     2.000 13750.1    81.10      100 9.528801
## Argentina   Latin Amer  other     2.172  9162.1    79.89       93 9.122831
##             logfertility
## Afghanistan    1.7864119
## Albania        0.4219944
## Algeria        0.7617400
## Angola         1.6360798
## Anguilla       0.6931472
## Argentina      0.7756484

# Simple linear regression of fertility on ppgdp (untransformed scale),
# followed by the coefficient table and fit statistics.
model1 <- lm(formula = fertility ~ ppgdp, data = UN11)
summary(object = model1)

## 
## Call:
## lm(formula = fertility ~ ppgdp, data = UN11)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -1.9006 -0.8801 -0.3547  0.6749  3.7585 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  3.178e+00  1.048e-01  30.331  < 2e-16 ***
## ppgdp       -3.201e-05  4.655e-06  -6.877  7.9e-11 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.206 on 197 degrees of freedom
## Multiple R-squared:  0.1936, Adjusted R-squared:  0.1895 
## F-statistic: 47.29 on 1 and 197 DF,  p-value: 7.903e-11

e)

#Solution
#The value of R^2 is 0.1936 and the value of adjusted R^2 is 0.1895. This means that only about 19% of the total variation in fertility is explained by the linear regression on ppgdp. This is a very poor fit.

f)

# Analysis-of-variance table for the fertility ~ ppgdp fit stored in model1
anova(object = model1)

## Analysis of Variance Table
## 
## Response: fertility
##            Df  Sum Sq Mean Sq F value    Pr(>F)    
## ppgdp       1  68.785  68.785  47.293 7.903e-11 ***
## Residuals 197 286.526   1.454                      
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

#Solution
#From the test results, the regression on ppgdp is significant: the p-value of 7.903e-11 is less than the 0.05 level of significance, so we reject the null hypothesis that the slope is zero.

g)

Solution

We start by creating a new data frame.

#The predict() function is then used, with the predictor value supplied through the newdata argument. We keep the default confidence level of 0.95 and set the interval type to "prediction".

# Reload UN11 and recompute the natural-log columns.
# The logs are stored as plain numeric vectors. Wrapping them in data.frame()
# would nest a data frame inside each column, which breaks later use of the
# columns in lm() or ggplot().
data("UN11")
UN11$logppgdp <- log(UN11$ppgdp)
UN11$logfertility <- log(UN11$fertility)
# Largest value of log(fertility) in the data
max(UN11$logfertility)

## [1] 1.935138

# Smallest value of log(fertility) in the data
min(UN11$logfertility)

## [1] 0.1257512

#Solution
# The highest value of log(fertility) in the data is 1.935138 (fertility of about 6.925), and the lowest value of log(fertility) is 0.1257512 (fertility of about 1.134).

HW assignment

2022-08-19

a) Identify the predictor and the response.

b)

c)

e)

f)

g)

Solution

We start first creating a new dataframe