Regression analysis is a method for modeling the relationship between a response variable and one or more predictor variables. With a single predictor the model is called simple linear regression; with several predictors it is called multiple linear regression (Draper & Smith, 1992). The least squares method is one of the methods used to estimate the regression parameters. Regression analysis is widely used for modeling in social, economic, and other fields. This article illustrates regression analysis and presents several ways of generating data for regression models.

Generate Data

1: Generating regression data

# Simulation 1: sampling behaviour of least-squares estimates when the two
# predictors are strongly correlated.
#
# Every replicate draws 10 rows of (X1, X2) from a bivariate normal with mean
# `mu` and covariance `Sigma` (correlation 0.9), builds the additive response
#   Y = b0 + b1 * X1 + b2 * X2 + eps,   eps drawn with rnorm(10),
# and fits lm(Y ~ X). `hasil` collects the mean and standard deviation of the
# three estimates over all replicates; `obj` is the fit from the last one.
#
# NOTE: any results printed after this block in the write-up came from an
# unseeded run of an earlier version that multiplied the two predictor terms
# instead of adding them; rerun this block to refresh those numbers.
library(MASS)

set.seed(1234) # fixed seed, as in the other simulations in this file

# True regression coefficients.
b0 <- 1
b1 <- 1
b2 <- 1

# Distribution of the predictors.
Sigma <- matrix(c(1, 0.9, 0.9, 1), nrow = 2, ncol = 2)
mu <- c(1, 1)

n <- 30 # number of simulation replicates (each replicate has 10 observations)

# Preallocate the storage for the estimates instead of growing it in the loop.
b0hat <- numeric(n)
b1hat <- numeric(n)
b2hat <- numeric(n)

for (i in seq_len(n)) {
  eps <- rnorm(10)
  X <- mvrnorm(10, mu, Sigma)
  # Additive model: the predictor terms are summed, not multiplied.
  Y <- b0 + b1 * X[, 1] + b2 * X[, 2] + eps
  obj <- lm(Y ~ X)
  b0hat[i] <- obj$coefficients[[1]]
  b1hat[i] <- obj$coefficients[[2]]
  b2hat[i] <- obj$coefficients[[3]]
}

# Row 1: mean of each estimate; row 2: its standard deviation (column-major
# fill, so each consecutive mean/sd pair forms one column).
hasil <- matrix(
  c(
    mean(b0hat), sd(b0hat),
    mean(b1hat), sd(b1hat),
    mean(b2hat), sd(b2hat)
  ),
  nrow = 2, ncol = 3
)
rownames(hasil) <- c(" mean ", "sd")
colnames(hasil) <- c("b0", "b1", "b2")
hasil
##               b0       b1        b2
##  mean  0.6323508 1.088127 0.9633439
## sd     0.7282659 1.264163 1.1241341
  # Auto-print the lm fit that the final pass of the loop above left in `obj`.
  obj
## 
## Call:
## lm(formula = Y ~ X)
## 
## Coefficients:
## (Intercept)           X1           X2  
##      0.5284       1.7376       0.9102

2: Generating data with multicollinearity

# Simulation 2: three predictors built on top of one another, so that they are
# strongly collinear, followed by an ordinary least-squares fit.
library(rlang)
library(car)
## Loading required package: carData
library(carData)

set.seed(1234)

# x1 is uniform on (0, 1); x2 is a linear function of x1 plus uniform noise;
# x3 is x2 plus uniform noise.
x1 <- runif(n = 100, min = 0, max = 1)
x2 <- 2 * x1 + runif(n = 100, min = 9, max = 10)
x3 <- runif(n = 100, min = 1, max = 2) + x2

# Response: linear in all three predictors with N(0, 1) errors.
e <- rnorm(n = 100, mean = 0, sd = 1)
y <- 10 + 3 * x1 + 5 * x2 + 7 * x3 + e

# Collect everything in one data frame and fit the regression model.
data <- data.frame(y = y, x1 = x1, x2 = x2, x3 = x3)
ls <- lm(y ~ x1 + x2 + x3, data = data)
summary(ls)
## 
## Call:
## lm(formula = y ~ x1 + x2 + x3, data = data)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -3.4452 -0.5275  0.0053  0.5029  2.9251 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   7.1887     3.6477   1.971   0.0516 .  
## x1            2.4929     0.8377   2.976   0.0037 ** 
## x2            5.0790     0.5775   8.795 5.76e-14 ***
## x3            7.1972     0.4229  17.019  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.1 on 96 degrees of freedom
## Multiple R-squared:  0.9848, Adjusted R-squared:  0.9843 
## F-statistic:  2068 on 3 and 96 DF,  p-value: < 2.2e-16
car::vif(ls) # variance inflation factor of each predictor in the fit
##        x1        x2        x3 
##  4.458522 10.382034  7.367583

3: Generating data with outliers

# Simulation 3: simple regression in which 10 of the 100 responses are
# contaminated by errors with a shifted mean.
set.seed(1234)

# Predictor values: 90 draws for the regular cases, 10 for the shifted cases.
x <- runif(n = 90, min = 0, max = 2)
x1 <- runif(n = 10, min = 0, max = 2)

# Regular cases: errors from N(0, 1).
e <- rnorm(n = 90, mean = 0, sd = 1)
y <- 10 + 5 * x + e
a <- data.frame(y = y, x = x, e = e)

# Shifted cases: same regression line, errors from N(2.5, 1).
e1 <- rnorm(n = 10, mean = 2.5, sd = 1)
y1 <- 10 + 5 * x1 + e1
b <- data.frame(y = y1, x = x1, e = e1)

# Stack both groups and fit one regression to the combined sample.
data <- rbind(a, b)
aa <- lm(y ~ x, data = data)
summary(aa)
## 
## Call:
## lm(formula = y ~ x, data = data)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -2.3267 -0.8440 -0.2210  0.5672  3.7803 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  10.4981     0.2439   43.05   <2e-16 ***
## x             4.8076     0.2354   20.42   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.306 on 98 degrees of freedom
## Multiple R-squared:  0.8097, Adjusted R-squared:  0.8078 
## F-statistic: 417.1 on 1 and 98 DF,  p-value: < 2.2e-16

4: Generating data with multicollinearity and outliers

# Simulation 4: repeated least-squares fits on a fixed design that combines
# collinear predictors (X2, X3, X4) with a block of outlying X1 values.
#
# The design is generated once; each of the `n` replicates only draws new
# rnorm(100) errors, builds the additive response
#   Y = b0 + b1 * X1 + b2 * X2 + b3 * X3 + b4 * X4 + eps,
# and refits the model. `hasil` holds the mean and standard deviation of every
# estimate over the replicates; `obj` is the fit from the last replicate.
#
# NOTE: any results printed after this block in the write-up came from an
# earlier version that multiplied the predictor terms instead of adding them;
# rerun this block to refresh those numbers.
set.seed(1234)
library(MASS)

# True regression coefficients.
b0 <- 1
b1 <- 1
b2 <- 1
b3 <- 1
b4 <- 1

# X1: 80 regular values from Unif(0, 2) followed by 20 outlying values from
# Unif(3, 6).
x <- runif(80, 0, 2)
xa <- runif(20, 3, 6)
X1 <- c(x, xa)

# X2, X3, X4: each one is built from the previous one, hence collinear.
X2 <- runif(100, 0, 1)         # Unif(0, 1)
X3 <- 3 * X2 + runif(100, 0, 1) # depends on X2
X4 <- runif(100, 0, 1) + 2 * X3 # depends on X3

X <- cbind(X1, X2, X3, X4)
colnames(X) <- c("X1", "X2", "X3", "X4")

n <- 30 # number of simulation replicates

# Preallocate the storage for the estimates instead of growing it in the loop.
b0hat <- numeric(n)
b1hat <- numeric(n)
b2hat <- numeric(n)
b3hat <- numeric(n)
b4hat <- numeric(n)

for (i in seq_len(n)) {
  eps <- rnorm(100)
  # Additive model: the predictor terms are summed, not multiplied.
  Y <- b0 + b1 * X1 + b2 * X2 + b3 * X3 + b4 * X4 + eps
  obj <- lm(Y ~ X1 + X2 + X3 + X4)
  b0hat[i] <- obj$coefficients[[1]]
  b1hat[i] <- obj$coefficients[[2]]
  b2hat[i] <- obj$coefficients[[3]]
  b3hat[i] <- obj$coefficients[[4]]
  b4hat[i] <- obj$coefficients[[5]]
}

# Row 1: mean of each estimate; row 2: its standard deviation (column-major
# fill, so each consecutive mean/sd pair forms one column).
hasil <- matrix(
  c(
    mean(b0hat), sd(b0hat),
    mean(b1hat), sd(b1hat),
    mean(b2hat), sd(b2hat),
    mean(b3hat), sd(b3hat),
    mean(b4hat), sd(b4hat)
  ),
  nrow = 2, ncol = 5
)
rownames(hasil) <- c(" mean ", "sd")
colnames(hasil) <- c("b0", "b1", "b2", "b3", "b4")
hasil
##                 b0          b1        b2         b3        b4
##  mean  -28.6439364 10.22637036 22.817783 -4.8689286 5.6775617
## sd       0.3890091  0.06579571  1.028994  0.6663952 0.3475919
  # Auto-print the lm fit that the final pass of the loop above left in `obj`.
  obj
## 
## Call:
## lm(formula = Y ~ X1 + X2 + X3 + X4)
## 
## Coefficients:
## (Intercept)           X1           X2           X3           X4  
##     -27.941       10.199       25.163       -5.352        5.487
  # Variance inflation factor of each predictor in the fit stored in `obj`.
  car::vif(obj) # VIF value
##        X1        X2        X3        X4 
##  1.048756 12.400769 48.156412 39.987238