sim_based

#çoklu bağlantı var mı?
assumptions <- function(x) {
  library(tidyr)
  library(dplyr)
  descr <- as.data.frame(matrix(NA, nrow = 7, ncol = ncol(x)))
  rownames(descr) <- c("Number_of_Observations",
                       "Number_of_missing_values",
                       "min_value", "max_value",
                       "median_value","_skewness_", "_kurtosis_")
  descriptives <- pastecs::stat.desc(x) #calculate descriptive statistics.
  
  ##This function taken from https://www.r-bloggers.com/computing-the-mode-in-r/
  #Mode = function(x) {
  #  ta = table(x)
  #  tam = max(ta)
  #  if (all(ta == tam))
  #    mod = NA
  #  else
  #    if(is.numeric(x))
  #      mod = as.numeric(names(ta)[ta == tam])
  #  else
  #    mod = names(ta)[ta == tam]
  #  return(mod)
  #}
  
  #calculate modes
  descr[1:4, ] <- descriptives[c(1, 3, 4, 5), ]
  descr[5, ] <- descriptives[8, ]
  descr[6, ] <- moments::skewness(x, na.rm = T)
  descr[7, ] <- moments::kurtosis(x, na.rm = T)-3
  
  
  #Calculate VIF and Tolerance values
  #To obtain IF and TV values describe the model
  x_new <- x
  x_new$rn <- 1:nrow(x)
  model_for_collinearity <- lm(
    as.formula(paste(colnames(x_new)[ncol(x_new)], "~",
                     paste(colnames(x_new)[1:(ncol(x_new)-1)], collapse = "+"),
                     sep = ""
    )), data = x_new)
  mc_VIF_TOL <- as.data.frame(mctest::mctest(model_for_collinearity,
                                             type = "i")$idiags[,1:2]) #calculate VIF and Tollerance values
  #Calculate Condition Index
  mc_CI <- mctest::eigprop(mod = model_for_collinearity)$ci
  #A data frame for summary of multicollinearity
  mc_control <- data.frame(min_VIF = min(mc_VIF_TOL$VIF),
                           max_VIF = max(mc_VIF_TOL$VIF),
                           min_TOL = min(mc_VIF_TOL$TOL),
                           max_TOL = max(mc_VIF_TOL$TOL),
                           min_CI = min(mc_CI),
                           max_CI = max(mc_CI)) #giving a summary of multicollinearity
  
  #Mahalanobis Distance Calculation
  #To calculate mahalanobis distance, missing values are not accepted.
  distance <- as.matrix(mahalanobis(x, colMeans(x), cov = cov(x)))
  #Those with Mahalanobis Distance p values bigger than 0.001 were considered as outliers.
  Mah_significant <- x %>%
    transmute(row_number = 1:nrow(x),
              Mahalanobis_distance = distance,
              Mah_p_value = pchisq(distance, df = ncol(x), lower.tail = F)) %>%
    filter(Mah_p_value <= 0.001)
  #Calculate Mardia's kurtosis value for multivariate normality
  mardia_kurt <- psych::mardia(x, plot = F)
  
  #Return a list consist of descriptive statistics, multicollinearity, multivariate normality and Mahalanobis distance for multivariate outliers
  return(list(descriptives = round(descr, 2),
              multicollinearity = round(mc_control, 2),
              Mah_significant = Mah_significant,
              n_outlier = nrow(Mah_significant),
              Mardia_Kurtosis = mardia_kurt$kurtosis,
              Mardia_Kurtosis_p_value = mardia_kurt$p.kurt )) }

#### Better Regression
n <- 100 # sample size
df <- n-2 # should match SR
m <- 50 # set mean of x
sd <- 10 # set sd of x
b0 <- 30 # intercept
b1 <- .50 # slope
e <- rnorm(n,sd=3) # RSE
x <- rnorm(n,m,sd) # predictor
y <- b0 + (b1*x) + e # linear equation
fit <- lm(y ~ x) # fit regression
summary(fit) # summarize

## 
## Call:
## lm(formula = y ~ x)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -7.2921 -2.1723  0.2968  1.9885  9.1183 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 28.34474    1.62600   17.43   <2e-16 ***
## x            0.53682    0.03199   16.78   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 3.348 on 98 degrees of freedom
## Multiple R-squared:  0.7418, Adjusted R-squared:  0.7392 
## F-statistic: 281.5 on 1 and 98 DF,  p-value: < 2.2e-16

cor(x,y)

## [1] 0.8612761

plot(x,y,abline(fit,col="red",lwd=2))

#### Create Noisier Estimates
n <- 100 # sample size
df <- n-2 # should match SR
m <- 50 # set mean of x
sd <- 10 # set sd of x
b0 <- 30 # intercept
b1 <- .50 # slope
e <- rnorm(n,sd=10) # RSE
x <- rnorm(n,m,sd) # predictor
y <- b0 + (b1*x) + e # linear equation
fit <- lm(y ~ x) # fit regression
summary(fit) # summarize

## 
## Call:
## lm(formula = y ~ x)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -28.0729  -5.5547  -0.0681   4.6105  24.7755 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 34.65155    4.55036   7.615 1.66e-11 ***
## x            0.41298    0.08807   4.689 8.87e-06 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 9.149 on 98 degrees of freedom
## Multiple R-squared:  0.1833, Adjusted R-squared:  0.1749 
## F-statistic: 21.99 on 1 and 98 DF,  p-value: 8.874e-06

cor(x,y)

## [1] 0.4280914

plot(x,y,abline(fit,col="red",lwd=2))

#### Predictor Parameters ####
n <- 100
m <- 50
sd <- 10
x1 <- rnorm(n,m,sd)
x2 <- rnorm(n,m,sd)

#### Linear Equation Setup ####
b0 <- 30
b1 <- .50
b2 <- .70
e <- rnorm(n,sd=1)
y <- b0 + (b1*x1) + (b2*x2)+ e

#### Save as Data Frame ####
df <- data.frame(x1,x2,y)

#### Fit and Summarize ####
fit <- lm(y ~ x1 + x2)
summary(fit)

## 
## Call:
## lm(formula = y ~ x1 + x2)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -2.74039 -0.70275 -0.01627  0.61685  2.78242 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 30.094467   0.742214   40.55   <2e-16 ***
## x1           0.501854   0.011592   43.29   <2e-16 ***
## x2           0.699674   0.009989   70.04   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.044 on 97 degrees of freedom
## Multiple R-squared:  0.987,  Adjusted R-squared:  0.9868 
## F-statistic:  3690 on 2 and 97 DF,  p-value: < 2.2e-16

cor(df)

##            x1         x2         y
## x1 1.00000000 0.08926959 0.5752486
## x2 0.08926959 1.00000000 0.8581287
## y  0.57524862 0.85812868 1.0000000

plot(y ~ x1 + x2, data = df)

### correlated predictors
library(MASS)
n <- 100  # sample size

# Means of the predictors
means <- c(50, 50)

# Standard deviations for predictors
sd1 <- 5
sd2 <- 10

# Desired correlation between predictors
r <- 0.50

# Covariance matrix calculation
cov_matrix <- matrix(c(sd1^2, r*sd1*sd2,
                       r*sd1*sd2, sd2^2), 
                     nrow=2)
# Generate correlated predictors
set.seed(123)
data <- mvrnorm(n, mu = means, Sigma = cov_matrix)

x1 <- data[,1]
x2 <- data[,2]

b0 <- 30
b1 <- 0.50
b2 <- 0.70
e <- rnorm(n, sd=1)

y <- b0 + b1*x1 + b2*x2 + e

df <- data.frame(x1, x2, y)

fit <- lm(y ~ x1 + x2, data=df)
summary(fit)

## 
## Call:
## lm(formula = y ~ x1 + x2, data = df)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -1.8730 -0.6607 -0.1245  0.6214  2.0798 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 31.12586    1.00524   30.96   <2e-16 ***
## x1           0.49082    0.02274   21.59   <2e-16 ***
## x2           0.68936    0.01214   56.77   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.9513 on 97 degrees of freedom
## Multiple R-squared:  0.9854, Adjusted R-squared:  0.9851 
## F-statistic:  3263 on 2 and 97 DF,  p-value: < 2.2e-16

cor(df) #x1 ve x2 arasındaki ilişki 0.70 olacak şekilde ayarlandı. Ancak tam 0.70 elde etmek gerçekçi olmuyor.

##           x1        x2         y
## x1 1.0000000 0.4978485 0.7062469
## x2 0.4978485 1.0000000 0.9565608
## y  0.7062469 0.9565608 1.0000000

assume <- assumptions(df[,1:2])

## 
## Attaching package: 'dplyr'

## The following object is masked from 'package:MASS':
## 
##     select

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

## Warning: Using one column matrices in `filter()` was deprecated in dplyr 1.1.0.
## ℹ Please use one dimensional logical vectors instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

assume$multicollinearity

##   min_VIF max_VIF min_TOL max_TOL min_CI max_CI
## 1    1.33    1.33    0.75    0.75      1  26.95

#### Complex Models
x <- runif(500)
y <- log(x) + rnorm(500,sd=.5)
plot(x,y,
     main="Exponential Decay Model",
     xlab = "Simulated X",
     ylab = "Simulated Y")

#### Fit Regression ####
poor.fit <- lm(y ~ x)

#### Add Regression Line ####
plot(x,y,
     main="Exponential Decay Model",
     xlab = "Simulated X",
     ylab = "Simulated Y")

abline(poor.fit,
       col = "red",
       lwd=2)

#### Plog Again and Run Regression ####
plot(x,y,
     main="Exponential Decay Model",
     xlab = "Simulated X",
     ylab = "Simulated Y")

#### Plog Again and Run Regression ####
plot(x,y,
     main="Exponential Decay Model",
     xlab = "Simulated X",
     ylab = "Simulated Y")

fit <- lm(y ~ log(x))

#### Create X Sequence ####
x.new <- seq(
  min(x),max(x),length.out=100
)

#### Plot Line ####
new <- data.frame(x = x.new)
pred <- predict(fit, newdata = new)
lines(x.new,pred,col="red",lwd=2)

sim_based_reg

Tugay Kaçak

2025-04-02