#çoklu bağlantı var mı?
assumptions <- function(x) {
library(tidyr)
library(dplyr)
descr <- as.data.frame(matrix(NA, nrow = 7, ncol = ncol(x)))
rownames(descr) <- c("Number_of_Observations",
"Number_of_missing_values",
"min_value", "max_value",
"median_value","_skewness_", "_kurtosis_")
descriptives <- pastecs::stat.desc(x) #calculate descriptive statistics.
##This function taken from https://www.r-bloggers.com/computing-the-mode-in-r/
#Mode = function(x) {
# ta = table(x)
# tam = max(ta)
# if (all(ta == tam))
# mod = NA
# else
# if(is.numeric(x))
# mod = as.numeric(names(ta)[ta == tam])
# else
# mod = names(ta)[ta == tam]
# return(mod)
#}
#calculate modes
descr[1:4, ] <- descriptives[c(1, 3, 4, 5), ]
descr[5, ] <- descriptives[8, ]
descr[6, ] <- moments::skewness(x, na.rm = T)
descr[7, ] <- moments::kurtosis(x, na.rm = T)-3
#Calculate VIF and Tolerance values
#To obtain IF and TV values describe the model
x_new <- x
x_new$rn <- 1:nrow(x)
model_for_collinearity <- lm(
as.formula(paste(colnames(x_new)[ncol(x_new)], "~",
paste(colnames(x_new)[1:(ncol(x_new)-1)], collapse = "+"),
sep = ""
)), data = x_new)
mc_VIF_TOL <- as.data.frame(mctest::mctest(model_for_collinearity,
type = "i")$idiags[,1:2]) #calculate VIF and Tollerance values
#Calculate Condition Index
mc_CI <- mctest::eigprop(mod = model_for_collinearity)$ci
#A data frame for summary of multicollinearity
mc_control <- data.frame(min_VIF = min(mc_VIF_TOL$VIF),
max_VIF = max(mc_VIF_TOL$VIF),
min_TOL = min(mc_VIF_TOL$TOL),
max_TOL = max(mc_VIF_TOL$TOL),
min_CI = min(mc_CI),
max_CI = max(mc_CI)) #giving a summary of multicollinearity
#Mahalanobis Distance Calculation
#To calculate mahalanobis distance, missing values are not accepted.
distance <- as.matrix(mahalanobis(x, colMeans(x), cov = cov(x)))
#Those with Mahalanobis Distance p values bigger than 0.001 were considered as outliers.
Mah_significant <- x %>%
transmute(row_number = 1:nrow(x),
Mahalanobis_distance = distance,
Mah_p_value = pchisq(distance, df = ncol(x), lower.tail = F)) %>%
filter(Mah_p_value <= 0.001)
#Calculate Mardia's kurtosis value for multivariate normality
mardia_kurt <- psych::mardia(x, plot = F)
#Return a list consist of descriptive statistics, multicollinearity, multivariate normality and Mahalanobis distance for multivariate outliers
return(list(descriptives = round(descr, 2),
multicollinearity = round(mc_control, 2),
Mah_significant = Mah_significant,
n_outlier = nrow(Mah_significant),
Mardia_Kurtosis = mardia_kurt$kurtosis,
Mardia_Kurtosis_p_value = mardia_kurt$p.kurt )) }
#### Better Regression
n <- 100 # sample size
df <- n-2 # should match SR
m <- 50 # set mean of x
sd <- 10 # set sd of x
b0 <- 30 # intercept
b1 <- .50 # slope
e <- rnorm(n,sd=3) # RSE
x <- rnorm(n,m,sd) # predictor
y <- b0 + (b1*x) + e # linear equation
fit <- lm(y ~ x) # fit regression
summary(fit) # summarize
##
## Call:
## lm(formula = y ~ x)
##
## Residuals:
## Min 1Q Median 3Q Max
## -7.2921 -2.1723 0.2968 1.9885 9.1183
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 28.34474 1.62600 17.43 <2e-16 ***
## x 0.53682 0.03199 16.78 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 3.348 on 98 degrees of freedom
## Multiple R-squared: 0.7418, Adjusted R-squared: 0.7392
## F-statistic: 281.5 on 1 and 98 DF, p-value: < 2.2e-16
cor(x,y)
## [1] 0.8612761
plot(x,y,abline(fit,col="red",lwd=2))

#### Create Noisier Estimates
n <- 100 # sample size
df <- n-2 # should match SR
m <- 50 # set mean of x
sd <- 10 # set sd of x
b0 <- 30 # intercept
b1 <- .50 # slope
e <- rnorm(n,sd=10) # RSE
x <- rnorm(n,m,sd) # predictor
y <- b0 + (b1*x) + e # linear equation
fit <- lm(y ~ x) # fit regression
summary(fit) # summarize
##
## Call:
## lm(formula = y ~ x)
##
## Residuals:
## Min 1Q Median 3Q Max
## -28.0729 -5.5547 -0.0681 4.6105 24.7755
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 34.65155 4.55036 7.615 1.66e-11 ***
## x 0.41298 0.08807 4.689 8.87e-06 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 9.149 on 98 degrees of freedom
## Multiple R-squared: 0.1833, Adjusted R-squared: 0.1749
## F-statistic: 21.99 on 1 and 98 DF, p-value: 8.874e-06
cor(x,y)
## [1] 0.4280914
plot(x,y,abline(fit,col="red",lwd=2))

#### Predictor Parameters ####
n <- 100
m <- 50
sd <- 10
x1 <- rnorm(n,m,sd)
x2 <- rnorm(n,m,sd)
#### Linear Equation Setup ####
b0 <- 30
b1 <- .50
b2 <- .70
e <- rnorm(n,sd=1)
y <- b0 + (b1*x1) + (b2*x2)+ e
#### Save as Data Frame ####
df <- data.frame(x1,x2,y)
#### Fit and Summarize ####
fit <- lm(y ~ x1 + x2)
summary(fit)
##
## Call:
## lm(formula = y ~ x1 + x2)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.74039 -0.70275 -0.01627 0.61685 2.78242
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 30.094467 0.742214 40.55 <2e-16 ***
## x1 0.501854 0.011592 43.29 <2e-16 ***
## x2 0.699674 0.009989 70.04 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.044 on 97 degrees of freedom
## Multiple R-squared: 0.987, Adjusted R-squared: 0.9868
## F-statistic: 3690 on 2 and 97 DF, p-value: < 2.2e-16
cor(df)
## x1 x2 y
## x1 1.00000000 0.08926959 0.5752486
## x2 0.08926959 1.00000000 0.8581287
## y 0.57524862 0.85812868 1.0000000
plot(y ~ x1 + x2, data = df)


### correlated predictors
library(MASS)
n <- 100 # sample size
# Means of the predictors
means <- c(50, 50)
# Standard deviations for predictors
sd1 <- 5
sd2 <- 10
# Desired correlation between predictors
r <- 0.50
# Covariance matrix calculation
cov_matrix <- matrix(c(sd1^2, r*sd1*sd2,
r*sd1*sd2, sd2^2),
nrow=2)
# Generate correlated predictors
set.seed(123)
data <- mvrnorm(n, mu = means, Sigma = cov_matrix)
x1 <- data[,1]
x2 <- data[,2]
b0 <- 30
b1 <- 0.50
b2 <- 0.70
e <- rnorm(n, sd=1)
y <- b0 + b1*x1 + b2*x2 + e
df <- data.frame(x1, x2, y)
fit <- lm(y ~ x1 + x2, data=df)
summary(fit)
##
## Call:
## lm(formula = y ~ x1 + x2, data = df)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.8730 -0.6607 -0.1245 0.6214 2.0798
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 31.12586 1.00524 30.96 <2e-16 ***
## x1 0.49082 0.02274 21.59 <2e-16 ***
## x2 0.68936 0.01214 56.77 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.9513 on 97 degrees of freedom
## Multiple R-squared: 0.9854, Adjusted R-squared: 0.9851
## F-statistic: 3263 on 2 and 97 DF, p-value: < 2.2e-16
cor(df) #x1 ve x2 arasındaki ilişki 0.70 olacak şekilde ayarlandı. Ancak tam 0.70 elde etmek gerçekçi olmuyor.
## x1 x2 y
## x1 1.0000000 0.4978485 0.7062469
## x2 0.4978485 1.0000000 0.9565608
## y 0.7062469 0.9565608 1.0000000
assume <- assumptions(df[,1:2])
##
## Attaching package: 'dplyr'
## The following object is masked from 'package:MASS':
##
## select
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
## Warning: Using one column matrices in `filter()` was deprecated in dplyr 1.1.0.
## ℹ Please use one dimensional logical vectors instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
assume$multicollinearity
## min_VIF max_VIF min_TOL max_TOL min_CI max_CI
## 1 1.33 1.33 0.75 0.75 1 26.95
#### Complex Models
x <- runif(500)
y <- log(x) + rnorm(500,sd=.5)
plot(x,y,
main="Exponential Decay Model",
xlab = "Simulated X",
ylab = "Simulated Y")

#### Fit Regression ####
poor.fit <- lm(y ~ x)
#### Add Regression Line ####
plot(x,y,
main="Exponential Decay Model",
xlab = "Simulated X",
ylab = "Simulated Y")
abline(poor.fit,
col = "red",
lwd=2)

#### Plog Again and Run Regression ####
plot(x,y,
main="Exponential Decay Model",
xlab = "Simulated X",
ylab = "Simulated Y")

#### Plog Again and Run Regression ####
plot(x,y,
main="Exponential Decay Model",
xlab = "Simulated X",
ylab = "Simulated Y")
fit <- lm(y ~ log(x))
#### Create X Sequence ####
x.new <- seq(
min(x),max(x),length.out=100
)
#### Plot Line ####
new <- data.frame(x = x.new)
pred <- predict(fit, newdata = new)
lines(x.new,pred,col="red",lwd=2)
