library(wooldridge)
## Warning: le package 'wooldridge' a été compilé avec la version R 4.4.2
data("discrim")
str(discrim)
## 'data.frame':    410 obs. of  37 variables:
##  $ psoda   : num  1.12 1.06 1.06 1.12 1.12 ...
##  $ pfries  : num  1.06 0.91 0.91 1.02 NA ...
##  $ pentree : num  1.02 0.95 0.98 1.06 0.49 ...
##  $ wagest  : num  4.25 4.75 4.25 5 5 ...
##  $ nmgrs   : num  3 3 3 4 3 4 3 3 4 3 ...
##  $ nregs   : int  5 3 5 5 3 4 2 5 4 5 ...
##  $ hrsopen : num  16 16.5 18 16 16 15 16 17 17 18 ...
##  $ emp     : num  27.5 21.5 30 27.5 5 17.5 22.5 18.5 17 27 ...
##  $ psoda2  : num  1.11 1.05 1.05 1.15 1.04 ...
##  $ pfries2 : num  1.11 0.89 0.94 1.05 1.01 ...
##  $ pentree2: num  1.05 0.95 0.98 1.05 0.58 ...
##  $ wagest2 : num  5.05 5.05 5.05 5.05 5.05 ...
##  $ nmgrs2  : num  5 4 4 4 3 3 3 3 4 6 ...
##  $ nregs2  : int  5 3 5 5 3 4 2 5 4 5 ...
##  $ hrsopen2: num  15 17.5 17.5 16 16 15 16 16 18 17 ...
##  $ emp2    : num  27 24.5 25 NA 12 28 18.5 17 34 22 ...
##  $ compown : int  1 0 0 0 0 0 0 1 0 1 ...
##  $ chain   : int  3 1 1 3 1 1 1 3 1 3 ...
##  $ density : num  4030 4030 11400 8345 720 ...
##  $ crmrte  : num  0.0529 0.0529 0.036 0.0484 0.0616 ...
##  $ state   : int  1 1 1 1 1 1 1 1 1 1 ...
##  $ prpblck : num  0.1712 0.1712 0.0474 0.0528 0.0345 ...
##  $ prppov  : num  0.0366 0.0366 0.0879 0.0591 0.0254 ...
##  $ prpncar : num  0.0788 0.0788 0.2694 0.1367 0.0738 ...
##  $ hseval  : num  148300 148300 169200 171600 249100 ...
##  $ nstores : int  3 3 3 3 1 2 1 1 5 5 ...
##  $ income  : num  44534 44534 41164 50366 72287 ...
##  $ county  : int  18 18 12 10 10 18 10 24 10 10 ...
##  $ lpsoda  : num  0.1133 0.0583 0.0583 0.1133 0.1133 ...
##  $ lpfries : num  0.0583 -0.0943 -0.0943 0.0198 NA ...
##  $ lhseval : num  11.9 11.9 12 12.1 12.4 ...
##  $ lincome : num  10.7 10.7 10.6 10.8 11.2 ...
##  $ ldensity: num  8.3 8.3 9.34 9.03 6.58 ...
##  $ NJ      : int  1 1 1 1 1 1 1 1 1 1 ...
##  $ BK      : int  0 1 1 0 1 1 1 0 1 0 ...
##  $ KFC     : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ RR      : int  1 0 0 1 0 0 0 1 0 1 ...
##  - attr(*, "time.stamp")= chr "25 Jun 2011 23:03"
library(wooldridge)
data("discrim")
head(discrim)
##   psoda pfries pentree wagest nmgrs nregs hrsopen  emp psoda2 pfries2 pentree2
## 1  1.12   1.06    1.02   4.25     3     5    16.0 27.5   1.11    1.11     1.05
## 2  1.06   0.91    0.95   4.75     3     3    16.5 21.5   1.05    0.89     0.95
## 3  1.06   0.91    0.98   4.25     3     5    18.0 30.0   1.05    0.94     0.98
## 4  1.12   1.02    1.06   5.00     4     5    16.0 27.5   1.15    1.05     1.05
## 5  1.12     NA    0.49   5.00     3     3    16.0  5.0   1.04    1.01     0.58
## 6  1.06   0.95    1.01   4.25     4     4    15.0 17.5   1.05    0.94     1.00
##   wagest2 nmgrs2 nregs2 hrsopen2 emp2 compown chain density    crmrte state
## 1    5.05      5      5     15.0 27.0       1     3    4030 0.0528866     1
## 2    5.05      4      3     17.5 24.5       0     1    4030 0.0528866     1
## 3    5.05      4      5     17.5 25.0       0     1   11400 0.0360003     1
## 4    5.05      4      5     16.0   NA       0     3    8345 0.0484232     1
## 5    5.05      3      3     16.0 12.0       0     1     720 0.0615890     1
## 6    5.05      3      4     15.0 28.0       0     1    4424 0.0334823     1
##     prpblck    prppov   prpncar hseval nstores income county     lpsoda
## 1 0.1711542 0.0365789 0.0788428 148300       3  44534     18 0.11332869
## 2 0.1711542 0.0365789 0.0788428 148300       3  44534     18 0.05826885
## 3 0.0473602 0.0879072 0.2694298 169200       3  41164     12 0.05826885
## 4 0.0528394 0.0591227 0.1366903 171600       3  50366     10 0.11332869
## 5 0.0344800 0.0254145 0.0738020 249100       1  72287     10 0.11332869
## 6 0.0591327 0.0835001 0.1151341 148000       2  44515     18 0.05826885
##       lpfries  lhseval  lincome ldensity NJ BK KFC RR
## 1  0.05826885 11.90699 10.70401 8.301521  1  0   0  1
## 2 -0.09431065 11.90699 10.70401 8.301521  1  1   0  0
## 3 -0.09431065 12.03884 10.62532 9.341369  1  1   0  0
## 4  0.01980261 12.05292 10.82707 9.029418  1  0   0  1
## 5          NA 12.42561 11.18840 6.579251  1  1   0  0
## 6 -0.05129331 11.90497 10.70358 8.394799  1  1   0  0
model <- lm(psoda ~ prpblck + income, data = discrim)
summary(model)
## 
## Call:
## lm(formula = psoda ~ prpblck + income, data = discrim)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.29401 -0.05242  0.00333  0.04231  0.44322 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 9.563e-01  1.899e-02  50.354  < 2e-16 ***
## prpblck     1.150e-01  2.600e-02   4.423 1.26e-05 ***
## income      1.603e-06  3.618e-07   4.430 1.22e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.08611 on 398 degrees of freedom
##   (9 observations effacées parce que manquantes)
## Multiple R-squared:  0.06422,    Adjusted R-squared:  0.05952 
## F-statistic: 13.66 on 2 and 398 DF,  p-value: 1.835e-06
write.csv(discrim, "discrim_data.csv", row.names = FALSE)
Variable = c("psoda", "prpblck", "income", "prppov")
Description = c(
    "Fast-food restoranlarındaki soda fiyatı",
    "Bölgede yaşayan siyah nüfusun oranı",
    "Bölgenin ortalama geliri (dolar cinsinden)",
    "Yoksulluk sınırının altında yaşayan nüfusun oranı")
variable_descriptions <- data.frame
print(variable_descriptions)
## function (..., row.names = NULL, check.rows = FALSE, check.names = TRUE, 
##     fix.empty.names = TRUE, stringsAsFactors = FALSE) 
## {
##     data.row.names <- if (check.rows && is.null(row.names)) 
##         function(current, new, i) {
##             if (is.character(current)) 
##                 new <- as.character(new)
##             if (is.character(new)) 
##                 current <- as.character(current)
##             if (anyDuplicated(new)) 
##                 return(current)
##             if (is.null(current)) 
##                 return(new)
##             if (all(current == new) || all(current == "")) 
##                 return(new)
##             stop(gettextf("mismatch of row names in arguments of 'data.frame', item %d", 
##                 i), domain = NA)
##         }
##     else function(current, new, i) {
##         current %||% if (anyDuplicated(new)) {
##             warning(gettextf("some row.names duplicated: %s --> row.names NOT used", 
##                 paste(which(duplicated(new)), collapse = ",")), 
##                 domain = NA)
##             current
##         }
##         else new
##     }
##     object <- as.list(substitute(list(...)))[-1L]
##     mirn <- missing(row.names)
##     mrn <- is.null(row.names)
##     x <- list(...)
##     n <- length(x)
##     if (n < 1L) {
##         if (!mrn) {
##             if (is.object(row.names) || !is.integer(row.names)) 
##                 row.names <- as.character(row.names)
##             if (anyNA(row.names)) 
##                 stop("row names contain missing values")
##             if (anyDuplicated(row.names)) 
##                 stop(gettextf("duplicate row.names: %s", paste(unique(row.names[duplicated(row.names)]), 
##                   collapse = ", ")), domain = NA)
##         }
##         else row.names <- integer()
##         return(structure(list(), names = character(), row.names = row.names, 
##             class = "data.frame"))
##     }
##     vnames <- names(x)
##     if (length(vnames) != n) 
##         vnames <- character(n)
##     no.vn <- !nzchar(vnames)
##     vlist <- vnames <- as.list(vnames)
##     nrows <- ncols <- integer(n)
##     for (i in seq_len(n)) {
##         xi <- if (is.character(x[[i]]) || is.list(x[[i]])) 
##             as.data.frame(x[[i]], optional = TRUE, stringsAsFactors = stringsAsFactors)
##         else as.data.frame(x[[i]], optional = TRUE)
##         nrows[i] <- .row_names_info(xi)
##         ncols[i] <- length(xi)
##         namesi <- names(xi)
##         if (ncols[i] > 1L) {
##             if (length(namesi) == 0L) 
##                 namesi <- seq_len(ncols[i])
##             vnames[[i]] <- if (no.vn[i]) 
##                 namesi
##             else paste(vnames[[i]], namesi, sep = ".")
##         }
##         else if (length(namesi)) {
##             vnames[[i]] <- namesi
##         }
##         else if (fix.empty.names && no.vn[[i]]) {
##             tmpname <- deparse(object[[i]], nlines = 1L)[1L]
##             if (startsWith(tmpname, "I(") && endsWith(tmpname, 
##                 ")")) {
##                 ntmpn <- nchar(tmpname, "c")
##                 tmpname <- substr(tmpname, 3L, ntmpn - 1L)
##             }
##             vnames[[i]] <- tmpname
##         }
##         if (mirn && nrows[i] > 0L) {
##             rowsi <- attr(xi, "row.names")
##             if (any(nzchar(rowsi))) 
##                 row.names <- data.row.names(row.names, rowsi, 
##                   i)
##         }
##         nrows[i] <- abs(nrows[i])
##         vlist[[i]] <- xi
##     }
##     nr <- max(nrows)
##     for (i in seq_len(n)[nrows < nr]) {
##         xi <- vlist[[i]]
##         if (nrows[i] > 0L && (nr%%nrows[i] == 0L)) {
##             xi <- unclass(xi)
##             fixed <- TRUE
##             for (j in seq_along(xi)) {
##                 xi1 <- xi[[j]]
##                 if (is.vector(xi1) || is.factor(xi1)) 
##                   xi[[j]] <- rep(xi1, length.out = nr)
##                 else if (is.character(xi1) && inherits(xi1, "AsIs")) 
##                   xi[[j]] <- structure(rep(xi1, length.out = nr), 
##                     class = class(xi1))
##                 else if (inherits(xi1, "Date") || inherits(xi1, 
##                   "POSIXct")) 
##                   xi[[j]] <- rep(xi1, length.out = nr)
##                 else {
##                   fixed <- FALSE
##                   break
##                 }
##             }
##             if (fixed) {
##                 vlist[[i]] <- xi
##                 next
##             }
##         }
##         stop(gettextf("arguments imply differing number of rows: %s", 
##             paste(unique(nrows), collapse = ", ")), domain = NA)
##     }
##     value <- unlist(vlist, recursive = FALSE, use.names = FALSE)
##     vnames <- as.character(unlist(vnames[ncols > 0L]))
##     if (fix.empty.names && any(noname <- !nzchar(vnames))) 
##         vnames[noname] <- paste0("Var.", seq_along(vnames))[noname]
##     if (check.names) {
##         if (fix.empty.names) 
##             vnames <- make.names(vnames, unique = TRUE)
##         else {
##             nz <- nzchar(vnames)
##             vnames[nz] <- make.names(vnames[nz], unique = TRUE)
##         }
##     }
##     names(value) <- vnames
##     if (!mrn) {
##         if (length(row.names) == 1L && nr != 1L) {
##             if (is.character(row.names)) 
##                 row.names <- match(row.names, vnames, 0L)
##             if (length(row.names) != 1L || row.names < 1L || 
##                 row.names > length(vnames)) 
##                 stop("'row.names' should specify one of the variables")
##             i <- row.names
##             row.names <- value[[i]]
##             value <- value[-i]
##         }
##         else if (!is.null(row.names) && length(row.names) != 
##             nr) 
##             stop("row names supplied are of the wrong length")
##     }
##     else if (!is.null(row.names) && length(row.names) != nr) {
##         warning("row names were found from a short variable and have been discarded")
##         row.names <- NULL
##     }
##     class(value) <- "data.frame"
##     if (is.null(row.names)) 
##         attr(value, "row.names") <- .set_row_names(nr)
##     else {
##         if (is.object(row.names) || !is.integer(row.names)) 
##             row.names <- as.character(row.names)
##         if (anyNA(row.names)) 
##             stop("row names contain missing values")
##         if (anyDuplicated(row.names)) 
##             stop(gettextf("duplicate row.names: %s", paste(unique(row.names[duplicated(row.names)]), 
##                 collapse = ", ")), domain = NA)
##         row.names(value) <- row.names
##     }
##     value
## }
## <bytecode: 0x0000019d8104adf0>
## <environment: namespace:base>
mean_prpblck <- mean(discrim$prpblck, na.rm = TRUE)
sd_prpblck <- sd(discrim$prpblck, na.rm = TRUE)
mean_income <- mean(discrim$income, na.rm = TRUE) 
mean_income <- mean(discrim$income, na.rm = TRUE)
sd_income <- sd(discrim$income, na.rm = TRUE)

cat("Income Ortalama:", round(mean_income, 2), "\n")
## Income Ortalama: 47053.78
cat("Income Standart Sapma:", round(sd_income, 2), "\n")
## Income Standart Sapma: 13179.29
model <- lm(psoda ~ prpblck + income, data = discrim)
summary(model)
## 
## Call:
## lm(formula = psoda ~ prpblck + income, data = discrim)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.29401 -0.05242  0.00333  0.04231  0.44322 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 9.563e-01  1.899e-02  50.354  < 2e-16 ***
## prpblck     1.150e-01  2.600e-02   4.423 1.26e-05 ***
## income      1.603e-06  3.618e-07   4.430 1.22e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.08611 on 398 degrees of freedom
##   (9 observations effacées parce que manquantes)
## Multiple R-squared:  0.06422,    Adjusted R-squared:  0.05952 
## F-statistic: 13.66 on 2 and 398 DF,  p-value: 1.835e-06
model_summary <- summary(model)
beta_0 <- model_summary$coefficients[1, 1]  # Sabit terim
beta_1 <- model_summary$coefficients[2, 1]  # prpblck katsayısı
beta_2 <- model_summary$coefficients[3, 1]  # income katsayısı
r_squared <- model_summary$r.squared         # R-kare
n <- nrow(discrim)   
cat("Tahmin edilen denklem:\n")
## Tahmin edilen denklem:
cat("psoda =", round(beta_0, 3), "+", round(beta_1, 3), "* prpblck +", round(beta_2, 6), "* income\n")
## psoda = 0.956 + 0.115 * prpblck + 2e-06 * income
cat("R-kare:", round(r_squared, 3), "n:", n, "\n")
## R-kare: 0.064 n: 410
model_simple <- lm(psoda ~ prpblck, data = discrim)
summary(model_simple)
## 
## Call:
## lm(formula = psoda ~ prpblck, data = discrim)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.30884 -0.05963  0.01135  0.03206  0.44840 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  1.03740    0.00519  199.87  < 2e-16 ***
## prpblck      0.06493    0.02396    2.71  0.00702 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.0881 on 399 degrees of freedom
##   (9 observations effacées parce que manquantes)
## Multiple R-squared:  0.01808,    Adjusted R-squared:  0.01561 
## F-statistic: 7.345 on 1 and 399 DF,  p-value: 0.007015
model_full <- lm(psoda ~ prpblck + income, data = discrim)
summary(model_full)
## 
## Call:
## lm(formula = psoda ~ prpblck + income, data = discrim)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.29401 -0.05242  0.00333  0.04231  0.44322 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 9.563e-01  1.899e-02  50.354  < 2e-16 ***
## prpblck     1.150e-01  2.600e-02   4.423 1.26e-05 ***
## income      1.603e-06  3.618e-07   4.430 1.22e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.08611 on 398 degrees of freedom
##   (9 observations effacées parce que manquantes)
## Multiple R-squared:  0.06422,    Adjusted R-squared:  0.05952 
## F-statistic: 13.66 on 2 and 398 DF,  p-value: 1.835e-06
cat("Basit modelde prpblck katsayısı:", coef(model_simple)["prpblck"], "\n")
## Basit modelde prpblck katsayısı: 0.06492687
cat("Genişletilmiş modelde prpblck katsayısı:", coef(model_full)["prpblck"], "\n")
## Genişletilmiş modelde prpblck katsayısı: 0.1149882
model_log <- lm(log(psoda) ~ prpblck + log(income), data = discrim)
summary(model_log)
## 
## Call:
## lm(formula = log(psoda) ~ prpblck + log(income), data = discrim)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.33563 -0.04695  0.00658  0.04334  0.35413 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -0.79377    0.17943  -4.424 1.25e-05 ***
## prpblck      0.12158    0.02575   4.722 3.24e-06 ***
## log(income)  0.07651    0.01660   4.610 5.43e-06 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.0821 on 398 degrees of freedom
##   (9 observations effacées parce que manquantes)
## Multiple R-squared:  0.06809,    Adjusted R-squared:  0.06341 
## F-statistic: 14.54 on 2 and 398 DF,  p-value: 8.039e-07
beta_1 <- coef(model_log)["prpblck"]
percentage_change <- beta_1 * 20
cat("prpblck 20 yüzde puan artarsa, psoda'nın tahmini yuzde değişimi:", round(percentage_change, 2), "%\n")
## prpblck 20 yüzde puan artarsa, psoda'nın tahmini yuzde değişimi: 2.43 %
model_log_prppov <- lm(log(psoda) ~ prpblck + log(income) + prppov, data = discrim)
summary(model_log_prppov)
## 
## Call:
## lm(formula = log(psoda) ~ prpblck + log(income) + prppov, data = discrim)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.32218 -0.04648  0.00651  0.04272  0.35622 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -1.46333    0.29371  -4.982  9.4e-07 ***
## prpblck      0.07281    0.03068   2.373   0.0181 *  
## log(income)  0.13696    0.02676   5.119  4.8e-07 ***
## prppov       0.38036    0.13279   2.864   0.0044 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.08137 on 397 degrees of freedom
##   (9 observations effacées parce que manquantes)
## Multiple R-squared:  0.08696,    Adjusted R-squared:  0.08006 
## F-statistic:  12.6 on 3 and 397 DF,  p-value: 6.917e-08
coef(model_log)["prpblck"]
##   prpblck 
## 0.1215803
coef(model_log_prppov)["prpblck"]
##    prpblck 
## 0.07280726
beta_1_before <- coef(model_log)["prpblck"]
beta_1_after <- coef(model_log_prppov)["prpblck"]
cat("Önceki modelde β₁ (prpblck):", round(beta_1_before, 4), "\n")
## Önceki modelde β₁ (prpblck): 0.1216
cat("Yeni modelde β₁ (prpblck):", round(beta_1_after, 4), "\n")
## Yeni modelde β₁ (prpblck): 0.0728
correlation <- cor(log(discrim$income), discrim$prppov, use = "complete.obs")
cat("log(income) ve prppov arasındaki korelasyon katsayısı:", round(correlation, 4), "\n")
## log(income) ve prppov arasındaki korelasyon katsayısı: -0.8385

#Aşağıdaki ifadeyi değerlendirin: “log(income) ve prppov çok yüksek oranda ilişkili olduğundan, aynı regresyonda olmalarına gerek yoktur.”

#Eğer korelasyon düşükse veya VIF değerleri makulse: #Sonuç: log(income) ve prppov farklı yönlerden bilgi taşıdığı için aynı modelde kullanılabilirler.

F testi

model <- lm(log(psoda) ~ prpblck + log(income) + prppov, data = discrim)
summary(model)
## 
## Call:
## lm(formula = log(psoda) ~ prpblck + log(income) + prppov, data = discrim)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.32218 -0.04648  0.00651  0.04272  0.35622 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -1.46333    0.29371  -4.982  9.4e-07 ***
## prpblck      0.07281    0.03068   2.373   0.0181 *  
## log(income)  0.13696    0.02676   5.119  4.8e-07 ***
## prppov       0.38036    0.13279   2.864   0.0044 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.08137 on 397 degrees of freedom
##   (9 observations effacées parce que manquantes)
## Multiple R-squared:  0.08696,    Adjusted R-squared:  0.08006 
## F-statistic:  12.6 on 3 and 397 DF,  p-value: 6.917e-08
restricted_model <- lm(log(psoda) ~ 1, data = discrim)
full_model <- lm(log(psoda) ~ prpblck + log(income) + prppov, data = discrim)