library(wooldridge)
## Warning: le package 'wooldridge' a été compilé avec la version R 4.4.2
data("discrim")
str(discrim)
## 'data.frame': 410 obs. of 37 variables:
## $ psoda : num 1.12 1.06 1.06 1.12 1.12 ...
## $ pfries : num 1.06 0.91 0.91 1.02 NA ...
## $ pentree : num 1.02 0.95 0.98 1.06 0.49 ...
## $ wagest : num 4.25 4.75 4.25 5 5 ...
## $ nmgrs : num 3 3 3 4 3 4 3 3 4 3 ...
## $ nregs : int 5 3 5 5 3 4 2 5 4 5 ...
## $ hrsopen : num 16 16.5 18 16 16 15 16 17 17 18 ...
## $ emp : num 27.5 21.5 30 27.5 5 17.5 22.5 18.5 17 27 ...
## $ psoda2 : num 1.11 1.05 1.05 1.15 1.04 ...
## $ pfries2 : num 1.11 0.89 0.94 1.05 1.01 ...
## $ pentree2: num 1.05 0.95 0.98 1.05 0.58 ...
## $ wagest2 : num 5.05 5.05 5.05 5.05 5.05 ...
## $ nmgrs2 : num 5 4 4 4 3 3 3 3 4 6 ...
## $ nregs2 : int 5 3 5 5 3 4 2 5 4 5 ...
## $ hrsopen2: num 15 17.5 17.5 16 16 15 16 16 18 17 ...
## $ emp2 : num 27 24.5 25 NA 12 28 18.5 17 34 22 ...
## $ compown : int 1 0 0 0 0 0 0 1 0 1 ...
## $ chain : int 3 1 1 3 1 1 1 3 1 3 ...
## $ density : num 4030 4030 11400 8345 720 ...
## $ crmrte : num 0.0529 0.0529 0.036 0.0484 0.0616 ...
## $ state : int 1 1 1 1 1 1 1 1 1 1 ...
## $ prpblck : num 0.1712 0.1712 0.0474 0.0528 0.0345 ...
## $ prppov : num 0.0366 0.0366 0.0879 0.0591 0.0254 ...
## $ prpncar : num 0.0788 0.0788 0.2694 0.1367 0.0738 ...
## $ hseval : num 148300 148300 169200 171600 249100 ...
## $ nstores : int 3 3 3 3 1 2 1 1 5 5 ...
## $ income : num 44534 44534 41164 50366 72287 ...
## $ county : int 18 18 12 10 10 18 10 24 10 10 ...
## $ lpsoda : num 0.1133 0.0583 0.0583 0.1133 0.1133 ...
## $ lpfries : num 0.0583 -0.0943 -0.0943 0.0198 NA ...
## $ lhseval : num 11.9 11.9 12 12.1 12.4 ...
## $ lincome : num 10.7 10.7 10.6 10.8 11.2 ...
## $ ldensity: num 8.3 8.3 9.34 9.03 6.58 ...
## $ NJ : int 1 1 1 1 1 1 1 1 1 1 ...
## $ BK : int 0 1 1 0 1 1 1 0 1 0 ...
## $ KFC : int 0 0 0 0 0 0 0 0 0 0 ...
## $ RR : int 1 0 0 1 0 0 0 1 0 1 ...
## - attr(*, "time.stamp")= chr "25 Jun 2011 23:03"
library(wooldridge)
data("discrim")
head(discrim)
## psoda pfries pentree wagest nmgrs nregs hrsopen emp psoda2 pfries2 pentree2
## 1 1.12 1.06 1.02 4.25 3 5 16.0 27.5 1.11 1.11 1.05
## 2 1.06 0.91 0.95 4.75 3 3 16.5 21.5 1.05 0.89 0.95
## 3 1.06 0.91 0.98 4.25 3 5 18.0 30.0 1.05 0.94 0.98
## 4 1.12 1.02 1.06 5.00 4 5 16.0 27.5 1.15 1.05 1.05
## 5 1.12 NA 0.49 5.00 3 3 16.0 5.0 1.04 1.01 0.58
## 6 1.06 0.95 1.01 4.25 4 4 15.0 17.5 1.05 0.94 1.00
## wagest2 nmgrs2 nregs2 hrsopen2 emp2 compown chain density crmrte state
## 1 5.05 5 5 15.0 27.0 1 3 4030 0.0528866 1
## 2 5.05 4 3 17.5 24.5 0 1 4030 0.0528866 1
## 3 5.05 4 5 17.5 25.0 0 1 11400 0.0360003 1
## 4 5.05 4 5 16.0 NA 0 3 8345 0.0484232 1
## 5 5.05 3 3 16.0 12.0 0 1 720 0.0615890 1
## 6 5.05 3 4 15.0 28.0 0 1 4424 0.0334823 1
## prpblck prppov prpncar hseval nstores income county lpsoda
## 1 0.1711542 0.0365789 0.0788428 148300 3 44534 18 0.11332869
## 2 0.1711542 0.0365789 0.0788428 148300 3 44534 18 0.05826885
## 3 0.0473602 0.0879072 0.2694298 169200 3 41164 12 0.05826885
## 4 0.0528394 0.0591227 0.1366903 171600 3 50366 10 0.11332869
## 5 0.0344800 0.0254145 0.0738020 249100 1 72287 10 0.11332869
## 6 0.0591327 0.0835001 0.1151341 148000 2 44515 18 0.05826885
## lpfries lhseval lincome ldensity NJ BK KFC RR
## 1 0.05826885 11.90699 10.70401 8.301521 1 0 0 1
## 2 -0.09431065 11.90699 10.70401 8.301521 1 1 0 0
## 3 -0.09431065 12.03884 10.62532 9.341369 1 1 0 0
## 4 0.01980261 12.05292 10.82707 9.029418 1 0 0 1
## 5 NA 12.42561 11.18840 6.579251 1 1 0 0
## 6 -0.05129331 11.90497 10.70358 8.394799 1 1 0 0
model <- lm(psoda ~ prpblck + income, data = discrim)
summary(model)
##
## Call:
## lm(formula = psoda ~ prpblck + income, data = discrim)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.29401 -0.05242 0.00333 0.04231 0.44322
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 9.563e-01 1.899e-02 50.354 < 2e-16 ***
## prpblck 1.150e-01 2.600e-02 4.423 1.26e-05 ***
## income 1.603e-06 3.618e-07 4.430 1.22e-05 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.08611 on 398 degrees of freedom
## (9 observations effacées parce que manquantes)
## Multiple R-squared: 0.06422, Adjusted R-squared: 0.05952
## F-statistic: 13.66 on 2 and 398 DF, p-value: 1.835e-06
write.csv(discrim, "discrim_data.csv", row.names = FALSE)
Variable = c("psoda", "prpblck", "income", "prppov")
Description = c(
"Fast-food restoranlarındaki soda fiyatı",
"Bölgede yaşayan siyah nüfusun oranı",
"Bölgenin ortalama geliri (dolar cinsinden)",
"Yoksulluk sınırının altında yaşayan nüfusun oranı")
variable_descriptions <- data.frame
print(variable_descriptions)
## function (..., row.names = NULL, check.rows = FALSE, check.names = TRUE,
## fix.empty.names = TRUE, stringsAsFactors = FALSE)
## {
## data.row.names <- if (check.rows && is.null(row.names))
## function(current, new, i) {
## if (is.character(current))
## new <- as.character(new)
## if (is.character(new))
## current <- as.character(current)
## if (anyDuplicated(new))
## return(current)
## if (is.null(current))
## return(new)
## if (all(current == new) || all(current == ""))
## return(new)
## stop(gettextf("mismatch of row names in arguments of 'data.frame', item %d",
## i), domain = NA)
## }
## else function(current, new, i) {
## current %||% if (anyDuplicated(new)) {
## warning(gettextf("some row.names duplicated: %s --> row.names NOT used",
## paste(which(duplicated(new)), collapse = ",")),
## domain = NA)
## current
## }
## else new
## }
## object <- as.list(substitute(list(...)))[-1L]
## mirn <- missing(row.names)
## mrn <- is.null(row.names)
## x <- list(...)
## n <- length(x)
## if (n < 1L) {
## if (!mrn) {
## if (is.object(row.names) || !is.integer(row.names))
## row.names <- as.character(row.names)
## if (anyNA(row.names))
## stop("row names contain missing values")
## if (anyDuplicated(row.names))
## stop(gettextf("duplicate row.names: %s", paste(unique(row.names[duplicated(row.names)]),
## collapse = ", ")), domain = NA)
## }
## else row.names <- integer()
## return(structure(list(), names = character(), row.names = row.names,
## class = "data.frame"))
## }
## vnames <- names(x)
## if (length(vnames) != n)
## vnames <- character(n)
## no.vn <- !nzchar(vnames)
## vlist <- vnames <- as.list(vnames)
## nrows <- ncols <- integer(n)
## for (i in seq_len(n)) {
## xi <- if (is.character(x[[i]]) || is.list(x[[i]]))
## as.data.frame(x[[i]], optional = TRUE, stringsAsFactors = stringsAsFactors)
## else as.data.frame(x[[i]], optional = TRUE)
## nrows[i] <- .row_names_info(xi)
## ncols[i] <- length(xi)
## namesi <- names(xi)
## if (ncols[i] > 1L) {
## if (length(namesi) == 0L)
## namesi <- seq_len(ncols[i])
## vnames[[i]] <- if (no.vn[i])
## namesi
## else paste(vnames[[i]], namesi, sep = ".")
## }
## else if (length(namesi)) {
## vnames[[i]] <- namesi
## }
## else if (fix.empty.names && no.vn[[i]]) {
## tmpname <- deparse(object[[i]], nlines = 1L)[1L]
## if (startsWith(tmpname, "I(") && endsWith(tmpname,
## ")")) {
## ntmpn <- nchar(tmpname, "c")
## tmpname <- substr(tmpname, 3L, ntmpn - 1L)
## }
## vnames[[i]] <- tmpname
## }
## if (mirn && nrows[i] > 0L) {
## rowsi <- attr(xi, "row.names")
## if (any(nzchar(rowsi)))
## row.names <- data.row.names(row.names, rowsi,
## i)
## }
## nrows[i] <- abs(nrows[i])
## vlist[[i]] <- xi
## }
## nr <- max(nrows)
## for (i in seq_len(n)[nrows < nr]) {
## xi <- vlist[[i]]
## if (nrows[i] > 0L && (nr%%nrows[i] == 0L)) {
## xi <- unclass(xi)
## fixed <- TRUE
## for (j in seq_along(xi)) {
## xi1 <- xi[[j]]
## if (is.vector(xi1) || is.factor(xi1))
## xi[[j]] <- rep(xi1, length.out = nr)
## else if (is.character(xi1) && inherits(xi1, "AsIs"))
## xi[[j]] <- structure(rep(xi1, length.out = nr),
## class = class(xi1))
## else if (inherits(xi1, "Date") || inherits(xi1,
## "POSIXct"))
## xi[[j]] <- rep(xi1, length.out = nr)
## else {
## fixed <- FALSE
## break
## }
## }
## if (fixed) {
## vlist[[i]] <- xi
## next
## }
## }
## stop(gettextf("arguments imply differing number of rows: %s",
## paste(unique(nrows), collapse = ", ")), domain = NA)
## }
## value <- unlist(vlist, recursive = FALSE, use.names = FALSE)
## vnames <- as.character(unlist(vnames[ncols > 0L]))
## if (fix.empty.names && any(noname <- !nzchar(vnames)))
## vnames[noname] <- paste0("Var.", seq_along(vnames))[noname]
## if (check.names) {
## if (fix.empty.names)
## vnames <- make.names(vnames, unique = TRUE)
## else {
## nz <- nzchar(vnames)
## vnames[nz] <- make.names(vnames[nz], unique = TRUE)
## }
## }
## names(value) <- vnames
## if (!mrn) {
## if (length(row.names) == 1L && nr != 1L) {
## if (is.character(row.names))
## row.names <- match(row.names, vnames, 0L)
## if (length(row.names) != 1L || row.names < 1L ||
## row.names > length(vnames))
## stop("'row.names' should specify one of the variables")
## i <- row.names
## row.names <- value[[i]]
## value <- value[-i]
## }
## else if (!is.null(row.names) && length(row.names) !=
## nr)
## stop("row names supplied are of the wrong length")
## }
## else if (!is.null(row.names) && length(row.names) != nr) {
## warning("row names were found from a short variable and have been discarded")
## row.names <- NULL
## }
## class(value) <- "data.frame"
## if (is.null(row.names))
## attr(value, "row.names") <- .set_row_names(nr)
## else {
## if (is.object(row.names) || !is.integer(row.names))
## row.names <- as.character(row.names)
## if (anyNA(row.names))
## stop("row names contain missing values")
## if (anyDuplicated(row.names))
## stop(gettextf("duplicate row.names: %s", paste(unique(row.names[duplicated(row.names)]),
## collapse = ", ")), domain = NA)
## row.names(value) <- row.names
## }
## value
## }
## <bytecode: 0x0000019d8104adf0>
## <environment: namespace:base>
mean_prpblck <- mean(discrim$prpblck, na.rm = TRUE)
sd_prpblck <- sd(discrim$prpblck, na.rm = TRUE)
mean_income <- mean(discrim$income, na.rm = TRUE)
mean_income <- mean(discrim$income, na.rm = TRUE)
sd_income <- sd(discrim$income, na.rm = TRUE)
cat("Income Ortalama:", round(mean_income, 2), "\n")
## Income Ortalama: 47053.78
cat("Income Standart Sapma:", round(sd_income, 2), "\n")
## Income Standart Sapma: 13179.29
model <- lm(psoda ~ prpblck + income, data = discrim)
summary(model)
##
## Call:
## lm(formula = psoda ~ prpblck + income, data = discrim)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.29401 -0.05242 0.00333 0.04231 0.44322
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 9.563e-01 1.899e-02 50.354 < 2e-16 ***
## prpblck 1.150e-01 2.600e-02 4.423 1.26e-05 ***
## income 1.603e-06 3.618e-07 4.430 1.22e-05 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.08611 on 398 degrees of freedom
## (9 observations effacées parce que manquantes)
## Multiple R-squared: 0.06422, Adjusted R-squared: 0.05952
## F-statistic: 13.66 on 2 and 398 DF, p-value: 1.835e-06
model_summary <- summary(model)
beta_0 <- model_summary$coefficients[1, 1] # Sabit terim
beta_1 <- model_summary$coefficients[2, 1] # prpblck katsayısı
beta_2 <- model_summary$coefficients[3, 1] # income katsayısı
r_squared <- model_summary$r.squared # R-kare
n <- nrow(discrim)
cat("Tahmin edilen denklem:\n")
## Tahmin edilen denklem:
cat("psoda =", round(beta_0, 3), "+", round(beta_1, 3), "* prpblck +", round(beta_2, 6), "* income\n")
## psoda = 0.956 + 0.115 * prpblck + 2e-06 * income
cat("R-kare:", round(r_squared, 3), "n:", n, "\n")
## R-kare: 0.064 n: 410
model_simple <- lm(psoda ~ prpblck, data = discrim)
summary(model_simple)
##
## Call:
## lm(formula = psoda ~ prpblck, data = discrim)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.30884 -0.05963 0.01135 0.03206 0.44840
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.03740 0.00519 199.87 < 2e-16 ***
## prpblck 0.06493 0.02396 2.71 0.00702 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.0881 on 399 degrees of freedom
## (9 observations effacées parce que manquantes)
## Multiple R-squared: 0.01808, Adjusted R-squared: 0.01561
## F-statistic: 7.345 on 1 and 399 DF, p-value: 0.007015
model_full <- lm(psoda ~ prpblck + income, data = discrim)
summary(model_full)
##
## Call:
## lm(formula = psoda ~ prpblck + income, data = discrim)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.29401 -0.05242 0.00333 0.04231 0.44322
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 9.563e-01 1.899e-02 50.354 < 2e-16 ***
## prpblck 1.150e-01 2.600e-02 4.423 1.26e-05 ***
## income 1.603e-06 3.618e-07 4.430 1.22e-05 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.08611 on 398 degrees of freedom
## (9 observations effacées parce que manquantes)
## Multiple R-squared: 0.06422, Adjusted R-squared: 0.05952
## F-statistic: 13.66 on 2 and 398 DF, p-value: 1.835e-06
cat("Basit modelde prpblck katsayısı:", coef(model_simple)["prpblck"], "\n")
## Basit modelde prpblck katsayısı: 0.06492687
cat("Genişletilmiş modelde prpblck katsayısı:", coef(model_full)["prpblck"], "\n")
## Genişletilmiş modelde prpblck katsayısı: 0.1149882
model_log <- lm(log(psoda) ~ prpblck + log(income), data = discrim)
summary(model_log)
##
## Call:
## lm(formula = log(psoda) ~ prpblck + log(income), data = discrim)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.33563 -0.04695 0.00658 0.04334 0.35413
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -0.79377 0.17943 -4.424 1.25e-05 ***
## prpblck 0.12158 0.02575 4.722 3.24e-06 ***
## log(income) 0.07651 0.01660 4.610 5.43e-06 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.0821 on 398 degrees of freedom
## (9 observations effacées parce que manquantes)
## Multiple R-squared: 0.06809, Adjusted R-squared: 0.06341
## F-statistic: 14.54 on 2 and 398 DF, p-value: 8.039e-07
beta_1 <- coef(model_log)["prpblck"]
percentage_change <- beta_1 * 20
cat("prpblck 20 yüzde puan artarsa, psoda'nın tahmini yuzde değişimi:", round(percentage_change, 2), "%\n")
## prpblck 20 yüzde puan artarsa, psoda'nın tahmini yuzde değişimi: 2.43 %
model_log_prppov <- lm(log(psoda) ~ prpblck + log(income) + prppov, data = discrim)
summary(model_log_prppov)
##
## Call:
## lm(formula = log(psoda) ~ prpblck + log(income) + prppov, data = discrim)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.32218 -0.04648 0.00651 0.04272 0.35622
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -1.46333 0.29371 -4.982 9.4e-07 ***
## prpblck 0.07281 0.03068 2.373 0.0181 *
## log(income) 0.13696 0.02676 5.119 4.8e-07 ***
## prppov 0.38036 0.13279 2.864 0.0044 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.08137 on 397 degrees of freedom
## (9 observations effacées parce que manquantes)
## Multiple R-squared: 0.08696, Adjusted R-squared: 0.08006
## F-statistic: 12.6 on 3 and 397 DF, p-value: 6.917e-08
coef(model_log)["prpblck"]
## prpblck
## 0.1215803
coef(model_log_prppov)["prpblck"]
## prpblck
## 0.07280726
beta_1_before <- coef(model_log)["prpblck"]
beta_1_after <- coef(model_log_prppov)["prpblck"]
cat("Önceki modelde β₁ (prpblck):", round(beta_1_before, 4), "\n")
## Önceki modelde β₁ (prpblck): 0.1216
cat("Yeni modelde β₁ (prpblck):", round(beta_1_after, 4), "\n")
## Yeni modelde β₁ (prpblck): 0.0728
correlation <- cor(log(discrim$income), discrim$prppov, use = "complete.obs")
cat("log(income) ve prppov arasındaki korelasyon katsayısı:", round(correlation, 4), "\n")
## log(income) ve prppov arasındaki korelasyon katsayısı: -0.8385
#Aşağıdaki ifadeyi değerlendirin: “log(income) ve prppov çok yüksek oranda ilişkili olduğundan, aynı regresyonda olmalarına gerek yoktur.”
#Eğer korelasyon düşükse veya VIF değerleri makulse: #Sonuç: log(income) ve prppov farklı yönlerden bilgi taşıdığı için aynı modelde kullanılabilirler.
F testi
model <- lm(log(psoda) ~ prpblck + log(income) + prppov, data = discrim)
summary(model)
##
## Call:
## lm(formula = log(psoda) ~ prpblck + log(income) + prppov, data = discrim)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.32218 -0.04648 0.00651 0.04272 0.35622
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -1.46333 0.29371 -4.982 9.4e-07 ***
## prpblck 0.07281 0.03068 2.373 0.0181 *
## log(income) 0.13696 0.02676 5.119 4.8e-07 ***
## prppov 0.38036 0.13279 2.864 0.0044 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.08137 on 397 degrees of freedom
## (9 observations effacées parce que manquantes)
## Multiple R-squared: 0.08696, Adjusted R-squared: 0.08006
## F-statistic: 12.6 on 3 and 397 DF, p-value: 6.917e-08
restricted_model <- lm(log(psoda) ~ 1, data = discrim)
full_model <- lm(log(psoda) ~ prpblck + log(income) + prppov, data = discrim)