Library Import

library(ggplot2)

overview

# Read the CSV into the data frame `a` and preview its first six rows.
a <- read.csv(file = "D:/MA334-SP-7_2412507 (1).csv")
head(a, n = 6L)

##   age educ gender hrswork insure metro nchild union  wage  race marital
## 1  29    4      1      40      1     1      2     0 25.95 White       1
## 2  45    3      1      45      1     1      3     0 14.44 White       2
## 3  39    2      1      40      1     1      1     0 17.25 White       1
## 4  30    3      0      45      1     1      0     0 17.09 White       0
## 5  42    3      0      60      1     0      3     1 18.33 White       1
## 6  47    3      1      45      1     1      0     0 22.64 White       1
##      region
## 1     south
## 2     south
## 3   midwest
## 4 northeast
## 5      west
## 6      west

# Column names of the data frame.
colnames(x = a)

##  [1] "age"     "educ"    "gender"  "hrswork" "insure"  "metro"   "nchild" 
##  [8] "union"   "wage"    "race"    "marital" "region"

# names() returns the same vector as colnames() for a data frame.
names(x = a)

##  [1] "age"     "educ"    "gender"  "hrswork" "insure"  "metro"   "nchild" 
##  [8] "union"   "wage"    "race"    "marital" "region"

# Per-column summary: quantiles and mean for numeric columns, length/class
# for character columns.
summary(object = a)

##       age             educ           gender         hrswork     
##  Min.   :17.00   Min.   :0.000   Min.   :0.000   Min.   : 0.00  
##  1st Qu.:32.00   1st Qu.:0.000   1st Qu.:0.000   1st Qu.:40.00  
##  Median :43.00   Median :2.000   Median :0.000   Median :40.00  
##  Mean   :42.61   Mean   :1.751   Mean   :0.442   Mean   :41.61  
##  3rd Qu.:52.00   3rd Qu.:3.000   3rd Qu.:1.000   3rd Qu.:42.00  
##  Max.   :77.00   Max.   :5.000   Max.   :1.000   Max.   :80.00  
##      insure           metro            nchild           union       
##  Min.   :0.0000   Min.   :0.0000   Min.   :0.0000   Min.   :0.0000  
##  1st Qu.:1.0000   1st Qu.:1.0000   1st Qu.:0.0000   1st Qu.:0.0000  
##  Median :1.0000   Median :1.0000   Median :0.0000   Median :0.0000  
##  Mean   :0.8256   Mean   :0.8239   Mean   :0.8061   Mean   :0.1372  
##  3rd Qu.:1.0000   3rd Qu.:1.0000   3rd Qu.:2.0000   3rd Qu.:0.0000  
##  Max.   :1.0000   Max.   :1.0000   Max.   :9.0000   Max.   :1.0000  
##       wage           race              marital          region         
##  Min.   : 2.50   Length:1181        Min.   :0.0000   Length:1181       
##  1st Qu.:13.00   Class :character   1st Qu.:0.0000   Class :character  
##  Median :18.75   Mode  :character   Median :1.0000   Mode  :character  
##  Mean   :22.77                      Mean   :0.8476                     
##  3rd Qu.:28.84                      3rd Qu.:1.0000                     
##  Max.   :99.00                      Max.   :2.0000

# Compact structure listing: storage type and first values of each column.
str(object = a)

## 'data.frame':    1181 obs. of  12 variables:
##  $ age    : int  29 45 39 30 42 47 62 57 21 69 ...
##  $ educ   : int  4 3 2 3 3 3 2 2 1 0 ...
##  $ gender : int  1 1 1 0 0 1 1 0 0 1 ...
##  $ hrswork: int  40 45 40 45 60 45 40 48 40 40 ...
##  $ insure : int  1 1 1 1 1 1 1 1 1 0 ...
##  $ metro  : int  1 1 1 1 0 1 1 1 1 1 ...
##  $ nchild : int  2 3 1 0 3 0 1 0 0 0 ...
##  $ union  : int  0 0 0 0 1 0 0 1 0 0 ...
##  $ wage   : num  25.9 14.4 17.2 17.1 18.3 ...
##  $ race   : chr  "White" "White" "White" "White" ...
##  $ marital: int  1 2 1 0 1 1 1 1 0 2 ...
##  $ region : chr  "south" "south" "midwest" "northeast" ...

# Number of rows and columns.
dim(x = a)

## [1] 1181   12

# Primary class of every column. vapply() always returns a named character
# vector; sapply() would silently switch to a list if any column carried more
# than one class.
vapply(a, function(column) class(column)[1L], character(1L))

##         age        educ      gender     hrswork      insure       metro 
##   "integer"   "integer"   "integer"   "integer"   "integer"   "integer" 
##      nchild       union        wage        race     marital      region 
##   "integer"   "integer"   "numeric" "character"   "integer" "character"

# Histogram of age using 30 bins.
ggplot(a, aes(x = age)) +
  geom_histogram(bins = 30, colour = "black", fill = "light blue") +
  labs(title = "Distribution of age", x = "Age", y = "Frequency")

# Age of each respondent plotted against race.
# `bins` is not a geom_point() parameter (it caused an "unknown parameters"
# warning) and `fill` has no effect on the default point shape, so both are
# dropped.
ggplot(a, aes(x = age, y = race)) +
  geom_point(colour = "black") +
  labs(title = "Age by race", x = "Age", y = "Race")

# Wage plotted against age as a line.
# geom_line() accepts neither `bins` nor `fill`; both were ignored with a
# warning, so they are removed.
ggplot(a, aes(x = age, y = wage)) +
  geom_line(colour = "black") +
  labs(title = "Wage by age", x = "Age", y = "Wage")

# Base-graphics box plot of the age column.
boxplot(a[["age"]], col = "lightgreen", main = "Boxplot of Age")

### bar plot

# Frequency of each gender code.
table(a[["gender"]])

## 
##   0   1 
## 659 522

# Bar chart of the gender counts.
# NOTE(review): the bar labels assume code 0 = Female and 1 = Male; confirm
# against the data dictionary.
barplot(
  table(a[["gender"]]),
  names.arg = c("Female", "Male"),
  col = c("pink", "lightblue"),
  main = "Gender Distribution"
)

#### correlation

# Pearson correlation matrix of all numeric columns. Integer-coded columns
# such as gender and marital are stored as integers at this point, so they
# are included.
# vapply() is type-stable: it always yields a logical mask, whereas sapply()
# returns an empty list for a data frame with no columns.
numeric_vars <- a[vapply(a, is.numeric, logical(1L))]
cor_matrix <- cor(numeric_vars)
print(cor_matrix)

##                 age        educ       gender      hrswork     insure
## age      1.00000000  0.01346022  0.015476376  0.055855032 0.14343160
## educ     0.01346022  1.00000000  0.108422815  0.124009972 0.22523942
## gender   0.01547638  0.10842281  1.000000000 -0.182095895 0.00921806
## hrswork  0.05585503  0.12400997 -0.182095895  1.000000000 0.17186357
## insure   0.14343160  0.22523942  0.009218060  0.171863571 1.00000000
## metro    0.02189381  0.12687303  0.048947258 -0.010085963 0.01592635
## nchild  -0.05046348 -0.02061457 -0.018252476  0.068662931 0.05890669
## union    0.04601745  0.01537744  0.041611556 -0.008912661 0.07950017
## wage     0.21194887  0.43406613 -0.135118267  0.090910833 0.22809839
## marital  0.38649288  0.04381669  0.001562404  0.050392044 0.07526822
##               metro      nchild        union        wage      marital
## age      0.02189381 -0.05046348  0.046017450  0.21194887  0.386492883
## educ     0.12687303 -0.02061457  0.015377437  0.43406613  0.043816690
## gender   0.04894726 -0.01825248  0.041611556 -0.13511827  0.001562404
## hrswork -0.01008596  0.06866293 -0.008912661  0.09091083  0.050392044
## insure   0.01592635  0.05890669  0.079500170  0.22809839  0.075268219
## metro    1.00000000 -0.01682765  0.061587132  0.12979292 -0.049867071
## nchild  -0.01682765  1.00000000  0.034466013  0.01655582  0.173948866
## union    0.06158713  0.03446601  1.000000000  0.05035602  0.030992347
## wage     0.12979292  0.01655582  0.050356016  1.00000000  0.150117896
## marital -0.04986707  0.17394887  0.030992347  0.15011790  1.000000000

Data Exploration

The dataset under consideration contains information about 1,181 individuals and includes twelve variables capturing demographic, employment, and socioeconomic characteristics. The variables cover numerical aspects such as age, hours worked, number of children, and wage, as well as categorical ones like gender, race, marital status, region, and insurance status. A brief summary shows that the dataset includes both gender groups (659 coded 0 and 522 coded 1), several racial groups, and four regions. Most individuals live in metropolitan areas (about 82%) and are insured (about 83%), while only about 14% are union members. The wage variable, a key focus of analysis, varies widely, from 2.50 to 99.00 with a median of 18.75 and a mean of 22.77, which indicates a right-skewed distribution. Education levels range from 0 to 5, and the number of children ranges from 0 to 9, although most individuals have two or fewer. Visual plots (such as histograms and bar charts created in R) help to reveal the shape of these distributions; for example, hours worked are concentrated around 40 per week. A correlation analysis indicates a moderate positive relationship between education and wage (r ≈ 0.43), suggesting that higher education is associated with better earnings. However, correlation alone does not establish causation, so strong conclusions should be avoided. Only very weak correlations are observed between wage and number of children (r ≈ 0.02) or hours worked (r ≈ 0.09). In summary, the initial exploratory analysis helps us understand key patterns related to employment and demographics.

Probability

# Share of rows with insure == 0, then the probability that a group of five
# contains at least one such row: 1 - P(none of the five), treating the five
# draws as independent.
p_not_insured <- sum(a[["insure"]] == 0) / nrow(a)
p_one_or_more <- 1 - (1 - p_not_insured)^5
print(p_one_or_more)

## [1] 0.6164927

P(nchild ≥ 1 | married)

# Conditional relative frequency of nchild >= 1 among rows with marital > 0.
# NOTE(review): this treats every non-zero marital code (1 and 2) as
# "married"; confirm the coding against the data dictionary.
married_data <- a[which(a[["marital"]] > 0), ]
p_children_given_married <-
  sum(married_data[["nchild"]] >= 1) / nrow(married_data)
print(p_children_given_married)

## [1] 0.5519253

Probability distribution of nchild

# Empirical distribution of nchild: counts per value, then counts divided by
# the total so the entries sum to one.
nchild_dist <- table(a[["nchild"]])
nchild_prob <- nchild_dist / sum(nchild_dist)
print(nchild_prob)

## 
##            0            1            2            3            4            5 
## 0.5605419136 0.1803556308 0.1837425910 0.0550381033 0.0127011008 0.0059271804 
##            6            9 
## 0.0008467401 0.0008467401

Mean and Variance of nchild

# Sample mean and sample variance (n - 1 denominator) of nchild.
mean_nchild <- mean(a[["nchild"]])
var_nchild <- var(a[["nchild"]])
print(mean_nchild)

## [1] 0.8060965

print(var_nchild)

## [1] 1.21237

# Relative frequency of rows with three or more children.
p_nchild_3plus <- sum(a[["nchild"]] >= 3) / nrow(a)
print(p_nchild_3plus)

## [1] 0.07535986

Probability, Distributions & Confidence Intervals

To demonstrate understanding of probabilistic thinking, we explore three specific queries based on the dataset. First, we consider the likelihood that one or more individuals out of a randomly selected group of five would not be covered by private health insurance. About 17.4% of individuals in the dataset are uninsured, so, treating the five selections as independent, the probability is 1 − (1 − 0.174)^5 ≈ 0.616. Next, we examine the probability that an individual has one or more children, given that they are married. Among the individuals treated as married in the code (marital > 0), about 55.2% have at least one child (Luo, Yan and McClure, 2021). We also analyze the distribution of the number of children (nchild). The observed values range from 0 to 9, but the distribution is strongly right-skewed rather than uniform: about 56% of individuals have no children, about 18% have one, about 18% have two, and fewer than 8% have three or more. The average number of children is 0.81, with a sample variance of 1.21. The calculated probability of having three or more children, based on our sample, is about 0.075. This section demonstrates foundational probability and distribution concepts using empirical relative frequencies from the sample.

Estimates, CI & Hypothesis Test

# Wages of respondents with exactly two children: point estimate of the mean
# and the confidence interval reported by a one-sample t-test (95% by
# default).
wage_2child <- a[["wage"]][which(a[["nchild"]] == 2)]
mean_2child <- mean(wage_2child)
ci_2child <- t.test(wage_2child)[["conf.int"]]
print(mean_2child)

## [1] 23.43355

print(ci_2child)

## [1] 21.58146 25.28563
## attr(,"conf.level")
## [1] 0.95

For 5+ children

# Wages of respondents with five or more children, and how many there are.
wage_5plus <- a[["wage"]][which(a[["nchild"]] >= 5)]
length(wage_5plus)

## [1] 9

# A t-based interval needs at least two observations. Previously only the
# "not enough data" case was handled, so when enough observations existed
# nothing was computed or reported at all.
if (length(wage_5plus) < 2) {
  print("Not enough data to compute CI for 5+ children.")
} else {
  mean_5plus <- mean(wage_5plus)
  ci_5plus <- t.test(wage_5plus)$conf.int
  print(mean_5plus)
  print(ci_5plus)
}

Contingency table and test

# Two-way contingency table: gender codes in rows, insure codes in columns.
table_gender_insure <- table(a[["gender"]], a[["insure"]])
print(table_gender_insure)

##    
##       0   1
##   0 117 542
##   1  89 433

Chi-square test

# Pearson chi-squared test of independence on the gender x insure table
# (chisq.test applies Yates' continuity correction to 2 x 2 tables by
# default).
test_result <- chisq.test(x = table_gender_insure)
print(test_result)

## 
##  Pearson's Chi-squared test with Yates' continuity correction
## 
## data:  table_gender_insure
## X-squared = 0.0574, df = 1, p-value = 0.8107

Point Estimates, Confidence Intervals & Hypothesis Testing

This section introduces inferential statistics through wage estimation and hypothesis testing. For individuals with exactly two children, we estimate the average wage and create a confidence interval around that estimate (Luo et al., 2022). The point estimate of the mean wage is 23.43, with a 95% t-based confidence interval of approximately 21.58 to 25.29. For households with five or more children, the dataset contains only nine individuals. A point estimate and interval can still be computed for this subgroup, but with so few observations the interval will be wide and should be interpreted with caution. A contingency table between insurance status and gender shows that 117 of 659 individuals with gender coded 0 and 89 of 522 individuals with gender coded 1 are uninsured, i.e. roughly 17–18% in each group. A chi-squared test of independence with Yates' continuity correction gives X-squared = 0.0574 on 1 degree of freedom with a p-value of 0.81, so we fail to reject the null hypothesis that insurance status is independent of gender.

Simple Linear Regression

# Split the rows of `a` at age 35: strictly below goes to `young`, 35 and
# above goes to `old`.
young <- a[which(a[["age"]] < 35), ]
old <- a[which(a[["age"]] >= 35), ]

Simple Linear Regression: log(wage) ~ age

# Simple linear regressions of log(wage) on age, one per age group, followed
# by the coefficient table for the young group.
model_young <- lm(formula = log(wage) ~ age, data = young)
model_old <- lm(formula = log(wage) ~ age, data = old)

summary(object = model_young)

## 
## Call:
## lm(formula = log(wage) ~ age, data = young)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1.63005 -0.32110 -0.01201  0.31821  1.49042 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 1.594555   0.173214   9.206  < 2e-16 ***
## age         0.041382   0.006074   6.813 3.85e-11 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.4816 on 374 degrees of freedom
## Multiple R-squared:  0.1104, Adjusted R-squared:  0.108 
## F-statistic: 46.41 on 1 and 374 DF,  p-value: 3.846e-11

# Coefficient table and fit statistics for the old group.
summary(object = model_old)

## 
## Call:
## lm(formula = log(wage) ~ age, data = old)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1.91172 -0.39124 -0.04711  0.39679  1.54456 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  3.0795566  0.1157775  26.599   <2e-16 ***
## age         -0.0005273  0.0023115  -0.228     0.82    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.5712 on 803 degrees of freedom
## Multiple R-squared:  6.479e-05,  Adjusted R-squared:  -0.00118 
## F-statistic: 0.05203 on 1 and 803 DF,  p-value: 0.8196

Scatter plots with regression lines

# Scatter plot of log(wage) against age for the young group, with the fitted
# regression line drawn on top.
plot(log(wage) ~ age, data = young, col = "blue",
     main = "Young: log(wage) vs Age")
abline(reg = model_young, lwd = 2, col = "red")

# Same plot for the old group.
plot(log(wage) ~ age, data = old, col = "green",
     main = "Old: log(wage) vs Age")
abline(reg = model_old, lwd = 2, col = "red")

Simple Linear Regression

To explore wage as a function of age, we divide the dataset into two groups:
• ‘Young’: individuals younger than 35
• ‘Old’: individuals aged 35 and above
The young subgroup contains 376 individuals and the old subgroup 805. We then perform simple linear regression using the natural logarithm of wage as the dependent variable and age as the independent variable (Rahman et al., 2021). For the young group, wage increases with age: the slope of 0.041 implies roughly a 4% higher wage per additional year of age, and it is highly significant (p < 0.001), although age explains only about 11% of the variation in log wage (R² = 0.110). For the old group, the slope is essentially zero (−0.0005, p = 0.82) and R² is close to zero, so there is no evidence of a linear relationship between age and log wage from age 35 onwards. These low coefficients of determination indicate considerable scatter around the fitted lines in the scatter plots for both subgroups, so age alone is a weak predictor of wage.

Multiple Linear Regression

# Encode the categorical columns as factors so that lm() expands them into
# dummy variables.
# `young` and `old` were subset from `a` before this point, so converting `a`
# alone never reached the models fitted on them: `marital` was still entered
# as a single numeric term. Convert all three data frames, taking the levels
# from `a` so they share one coding.
factor_cols <- c("race", "region", "marital")
a[factor_cols] <- lapply(a[factor_cols], as.factor)
young[factor_cols] <- lapply(
  factor_cols,
  function(col) factor(young[[col]], levels = levels(a[[col]]))
)
old[factor_cols] <- lapply(
  factor_cols,
  function(col) factor(old[[col]], levels = levels(a[[col]]))
)

Multiple Linear Regression

Building upon the simple models, we now perform multiple linear regression of log(wage) on all other available variables. Categorical variables such as race and region enter the model as dummy variables, while gender is already coded 0/1 (Sarkar, Dauer and In, 2022). The full models explain a moderate share of the variation in log wage: adjusted R² is 0.313 for the young group and 0.263 for the old group. In the young group, age, education, gender, insurance and union membership are significant at the 5% level; in the old group, education, gender, insurance, metropolitan residence and the west region are significant, while age is not. This brings us to the argument for reduced models. When certain predictors contribute little explanatory power, it is often preferable to fit a simpler model that focuses on the most relevant variables. Backward stepwise selection by AIC retains age, educ, gender, insure and union for the young group, and educ, gender, insure, metro, marital and region for the old group, with almost no loss in adjusted R² (0.313 and 0.263 respectively). Simpler models are easier to interpret and more robust to changes in data.

Full model for young

# Full model for the young group: log(wage) regressed on every other column.
# The predictors are spelled out instead of writing `. - wage`; combining the
# dot with a transformed response made terms.formula() emit the
# "'varlist' has changed" warning.
model_full_young <- lm(
  log(wage) ~ age + educ + gender + hrswork + insure + metro + nchild +
    union + race + marital + region,
  data = young
)

summary(model_full_young)

## 
## Call:
## lm(formula = log(wage) ~ . - wage, data = young)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1.36303 -0.26382 -0.01698  0.25524  1.30213 
## 
## Coefficients:
##                  Estimate Std. Error t value Pr(>|t|)    
## (Intercept)      1.834191   0.209063   8.773  < 2e-16 ***
## age              0.028458   0.006332   4.495 9.40e-06 ***
## educ             0.121518   0.017415   6.978 1.44e-11 ***
## gender          -0.194123   0.047602  -4.078 5.59e-05 ***
## hrswork         -0.003245   0.002368  -1.370   0.1715    
## insure           0.224896   0.053054   4.239 2.85e-05 ***
## metro            0.011774   0.058192   0.202   0.8398    
## nchild          -0.025153   0.025517  -0.986   0.3249    
## union            0.159936   0.073275   2.183   0.0297 *  
## raceBlack       -0.172978   0.118896  -1.455   0.1466    
## raceWhite       -0.102353   0.089136  -1.148   0.2516    
## marital          0.051933   0.043641   1.190   0.2348    
## regionnortheast  0.116789   0.067034   1.742   0.0823 .  
## regionsouth      0.010973   0.058890   0.186   0.8523    
## regionwest       0.048742   0.065094   0.749   0.4545    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.4226 on 361 degrees of freedom
## Multiple R-squared:  0.3388, Adjusted R-squared:  0.3132 
## F-statistic: 13.21 on 14 and 361 DF,  p-value: < 2.2e-16

Full model for old

# Full model for the old group: log(wage) regressed on every other column.
# The predictors are spelled out instead of writing `. - wage`; combining the
# dot with a transformed response made terms.formula() emit the
# "'varlist' has changed" warning.
model_full_old <- lm(
  log(wage) ~ age + educ + gender + hrswork + insure + metro + nchild +
    union + race + marital + region,
  data = old
)

summary(model_full_old)

## 
## Call:
## lm(formula = log(wage) ~ . - wage, data = old)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1.85888 -0.30451  0.02666  0.32575  1.31774 
## 
## Coefficients:
##                   Estimate Std. Error t value Pr(>|t|)    
## (Intercept)      2.2694898  0.1799543  12.611  < 2e-16 ***
## age              0.0003294  0.0021291   0.155  0.87711    
## educ             0.1551089  0.0119593  12.970  < 2e-16 ***
## gender          -0.1811629  0.0355925  -5.090 4.48e-07 ***
## hrswork          0.0015615  0.0021518   0.726  0.46824    
## insure           0.2475608  0.0528619   4.683 3.32e-06 ***
## metro            0.1417880  0.0471982   3.004  0.00275 ** 
## nchild          -0.0177843  0.0164374  -1.082  0.27961    
## union            0.0452883  0.0489084   0.926  0.35474    
## raceBlack       -0.0106162  0.1013315  -0.105  0.91659    
## raceWhite        0.0849661  0.0832381   1.021  0.30768    
## marital          0.0548061  0.0320963   1.708  0.08811 .  
## regionnortheast  0.0536894  0.0533367   1.007  0.31443    
## regionsouth      0.0456868  0.0466322   0.980  0.32752    
## regionwest       0.1326383  0.0506384   2.619  0.00898 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.49 on 790 degrees of freedom
## Multiple R-squared:  0.276,  Adjusted R-squared:  0.2632 
## F-statistic: 21.51 on 14 and 790 DF,  p-value: < 2.2e-16

Optional: Reduced model explanation

# Backward elimination by AIC starting from the full young-group model; the
# search trace is printed as it runs.
model_reduced_young <- step(
  object = model_full_young,
  direction = "backward"
)

## Start:  AIC=-633.1
## log(wage) ~ (age + educ + gender + hrswork + insure + metro + 
##     nchild + union + race + marital + region) - wage
## 
##           Df Sum of Sq    RSS     AIC
## - region   3    0.6450 65.104 -635.36
## - metro    1    0.0073 64.466 -635.06
## - race     2    0.3852 64.844 -634.86
## - nchild   1    0.1735 64.632 -634.09
## - marital  1    0.2529 64.712 -633.63
## - hrswork  1    0.3352 64.794 -633.15
## <none>                 64.459 -633.10
## - union    1    0.8507 65.310 -630.17
## - gender   1    2.9695 67.428 -618.16
## - insure   1    3.2085 67.667 -616.83
## - age      1    3.6070 68.066 -614.63
## - educ     1    8.6936 73.153 -587.53
## 
## Step:  AIC=-635.36
## log(wage) ~ age + educ + gender + hrswork + insure + metro + 
##     nchild + union + race + marital
## 
##           Df Sum of Sq    RSS     AIC
## - metro    1    0.0047 65.109 -637.33
## - race     2    0.3898 65.494 -637.11
## - marital  1    0.2128 65.317 -636.13
## - nchild   1    0.2301 65.334 -636.03
## - hrswork  1    0.2595 65.363 -635.86
## <none>                 65.104 -635.36
## - union    1    0.9538 66.058 -631.89
## - gender   1    2.9645 68.068 -620.61
## - insure   1    3.1964 68.300 -619.33
## - age      1    3.8440 68.948 -615.79
## - educ     1    8.9125 74.017 -589.11
## 
## Step:  AIC=-637.33
## log(wage) ~ age + educ + gender + hrswork + insure + nchild + 
##     union + race + marital
## 
##           Df Sum of Sq    RSS     AIC
## - race     2    0.3888 65.498 -639.09
## - marital  1    0.2092 65.318 -638.12
## - nchild   1    0.2470 65.356 -637.90
## - hrswork  1    0.2701 65.379 -637.77
## <none>                 65.109 -637.33
## - union    1    0.9591 66.068 -633.83
## - gender   1    2.9667 68.075 -622.57
## - insure   1    3.1916 68.300 -621.33
## - age      1    3.8903 68.999 -617.51
## - educ     1    9.0659 74.175 -590.31
## 
## Step:  AIC=-639.09
## log(wage) ~ age + educ + gender + hrswork + insure + nchild + 
##     union + marital
## 
##           Df Sum of Sq    RSS     AIC
## - marital  1    0.2076 65.705 -639.90
## - nchild   1    0.2704 65.768 -639.54
## - hrswork  1    0.2831 65.781 -639.47
## <none>                 65.498 -639.09
## - union    1    0.8531 66.351 -636.22
## - gender   1    3.2283 68.726 -623.00
## - insure   1    3.3402 68.838 -622.39
## - age      1    4.0068 69.504 -618.76
## - educ     1    9.7441 75.242 -588.94
## 
## Step:  AIC=-639.9
## log(wage) ~ age + educ + gender + hrswork + insure + nchild + 
##     union
## 
##           Df Sum of Sq    RSS     AIC
## - nchild   1    0.1604 65.866 -640.98
## - hrswork  1    0.2711 65.976 -640.35
## <none>                 65.705 -639.90
## - union    1    0.8237 66.529 -637.21
## - gender   1    3.3342 69.039 -623.29
## - insure   1    3.4490 69.154 -622.66
## - age      1    4.9450 70.650 -614.62
## - educ     1    9.8609 75.566 -589.32
## 
## Step:  AIC=-640.98
## log(wage) ~ age + educ + gender + hrswork + insure + union
## 
##           Df Sum of Sq    RSS     AIC
## - hrswork  1    0.2705 66.136 -641.44
## <none>                 65.866 -640.98
## - union    1    0.8129 66.678 -638.37
## - gender   1    3.3877 69.253 -624.12
## - insure   1    3.4395 69.305 -623.84
## - age      1    4.9657 70.831 -615.65
## - educ     1   10.9429 76.808 -585.19
## 
## Step:  AIC=-641.44
## log(wage) ~ age + educ + gender + insure + union
## 
##          Df Sum of Sq    RSS     AIC
## <none>                66.136 -641.44
## - union   1    0.8544 66.990 -638.62
## - gender  1    3.1282 69.264 -626.06
## - insure  1    3.1971 69.333 -625.69
## - age     1    4.7057 70.842 -617.60
## - educ    1   10.7339 76.870 -586.89

# Backward elimination by AIC starting from the full old-group model; the
# search trace is printed as it runs.
model_reduced_old <- step(
  object = model_full_old,
  direction = "backward"
)

## Start:  AIC=-1133.57
## log(wage) ~ (age + educ + gender + hrswork + insure + metro + 
##     nchild + union + race + marital + region) - wage
## 
##           Df Sum of Sq    RSS      AIC
## - age      1     0.006 189.70 -1135.54
## - hrswork  1     0.126 189.82 -1135.03
## - union    1     0.206 189.90 -1134.70
## - nchild   1     0.281 189.98 -1134.38
## - race     2     0.784 190.48 -1134.25
## <none>                 189.69 -1133.57
## - marital  1     0.700 190.40 -1132.60
## - region   3     1.686 191.38 -1132.45
## - metro    1     2.167 191.86 -1126.43
## - insure   1     5.266 194.96 -1113.53
## - gender   1     6.221 195.91 -1109.59
## - educ     1    40.392 230.09  -980.17
## 
## Step:  AIC=-1135.54
## log(wage) ~ educ + gender + hrswork + insure + metro + nchild + 
##     union + race + marital + region
## 
##           Df Sum of Sq    RSS      AIC
## - hrswork  1     0.127 189.83 -1137.01
## - union    1     0.206 189.91 -1136.67
## - race     2     0.779 190.48 -1136.25
## - nchild   1     0.351 190.05 -1136.06
## <none>                 189.70 -1135.54
## - marital  1     0.715 190.41 -1134.52
## - region   3     1.683 191.38 -1134.44
## - metro    1     2.168 191.87 -1128.40
## - insure   1     5.291 194.99 -1115.40
## - gender   1     6.216 195.92 -1111.59
## - educ     1    40.460 230.16  -981.91
## 
## Step:  AIC=-1137.01
## log(wage) ~ educ + gender + insure + metro + nchild + union + 
##     race + marital + region
## 
##           Df Sum of Sq    RSS      AIC
## - union    1     0.201 190.03 -1138.15
## - race     2     0.790 190.62 -1137.66
## - nchild   1     0.327 190.15 -1137.62
## <none>                 189.83 -1137.01
## - marital  1     0.702 190.53 -1136.03
## - region   3     1.676 191.50 -1135.93
## - metro    1     2.217 192.04 -1129.66
## - insure   1     5.494 195.32 -1116.04
## - gender   1     6.745 196.57 -1110.90
## - educ     1    41.435 231.26  -980.07
## 
## Step:  AIC=-1138.15
## log(wage) ~ educ + gender + insure + metro + nchild + race + 
##     marital + region
## 
##           Df Sum of Sq    RSS      AIC
## - nchild   1     0.313 190.34 -1138.83
## - race     2     0.802 190.83 -1138.77
## <none>                 190.03 -1138.15
## - marital  1     0.721 190.75 -1137.11
## - region   3     1.788 191.82 -1136.61
## - metro    1     2.300 192.33 -1130.47
## - insure   1     5.667 195.69 -1116.50
## - gender   1     6.711 196.74 -1112.22
## - educ     1    41.245 231.27  -982.03
## 
## Step:  AIC=-1138.83
## log(wage) ~ educ + gender + insure + metro + race + marital + 
##     region
## 
##           Df Sum of Sq    RSS      AIC
## - race     2     0.944 191.29 -1138.84
## <none>                 190.34 -1138.83
## - marital  1     0.676 191.02 -1137.97
## - region   3     1.812 192.15 -1137.20
## - metro    1     2.219 192.56 -1131.50
## - insure   1     5.501 195.84 -1117.89
## - gender   1     6.642 196.98 -1113.22
## - educ     1    41.223 231.56  -983.02
## 
## Step:  AIC=-1138.84
## log(wage) ~ educ + gender + insure + metro + marital + region
## 
##           Df Sum of Sq    RSS      AIC
## <none>                 191.29 -1138.84
## - marital  1     0.809 192.09 -1137.45
## - region   3     1.852 193.14 -1137.09
## - metro    1     2.141 193.43 -1131.88
## - insure   1     5.640 196.93 -1117.45
## - gender   1     6.981 198.27 -1111.99
## - educ     1    41.607 232.89  -982.41

# Coefficient table and fit statistics of the reduced young-group model.
summary(object = model_reduced_young)

## 
## Call:
## lm(formula = log(wage) ~ age + educ + gender + insure + union, 
##     data = young)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1.38577 -0.26592 -0.02603  0.24556  1.27938 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  1.655058   0.155906  10.616  < 2e-16 ***
## age          0.028131   0.005483   5.131 4.67e-07 ***
## educ         0.128586   0.016593   7.749 9.00e-14 ***
## gender      -0.191847   0.045859  -4.183 3.59e-05 ***
## insure       0.218750   0.051724   4.229 2.96e-05 ***
## union        0.156707   0.071678   2.186   0.0294 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.4228 on 370 degrees of freedom
## Multiple R-squared:  0.3216, Adjusted R-squared:  0.3125 
## F-statistic: 35.08 on 5 and 370 DF,  p-value: < 2.2e-16

# Coefficient table and fit statistics of the reduced old-group model.
summary(object = model_reduced_old)

## 
## Call:
## lm(formula = log(wage) ~ educ + gender + insure + metro + marital + 
##     region, data = old)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1.87787 -0.29972  0.02165  0.33262  1.29289 
## 
## Coefficients:
##                 Estimate Std. Error t value Pr(>|t|)    
## (Intercept)      2.41277    0.07787  30.986  < 2e-16 ***
## educ             0.15494    0.01178  13.158  < 2e-16 ***
## gender          -0.18811    0.03490  -5.390 9.29e-08 ***
## insure           0.25329    0.05229   4.844 1.53e-06 ***
## metro            0.14017    0.04696   2.985  0.00292 ** 
## marital          0.05844    0.03185   1.835  0.06691 .  
## regionnortheast  0.05972    0.05313   1.124  0.26135    
## regionsouth      0.03501    0.04599   0.761  0.44672    
## regionwest       0.13263    0.04972   2.667  0.00780 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.4902 on 796 degrees of freedom
## Multiple R-squared:  0.2699, Adjusted R-squared:  0.2626 
## F-statistic: 36.78 on 8 and 796 DF,  p-value: < 2.2e-16

Conclusion

This report showcases the application of fundamental statistical techniques to a real-world dataset of 1,181 individuals using R. Through exploratory analysis, basic probability theory, hypothesis testing, and regression modeling, we developed an understanding of the relationships within the dataset. Education, gender and insurance status are consistently associated with log wage, age matters mainly for individuals under 35, and no evidence was found that insurance status depends on gender. Limitations remain: the regression models explain only around 30% of the variation in log wage, some subgroups (such as individuals with five or more children) are very small, and the analysis is observational, so the associations should not be read as causal. Key takeaways include the need for sufficiently large subgroups for reliable subgroup estimates, and the importance of context-aware statistical reasoning.

References

Luo, Y., Yan, J. and McClure, S., 2021. Distribution of the environmental and socioeconomic risk factors on COVID-19 death rate across continental USA: a spatial nonlinear analysis. Environmental Science and Pollution Research, 28(6), pp.6587-6599.

Luo, Y., Yan, J., McClure, S.C. and Li, F., 2022. Socioeconomic and environmental factors of poverty in China using geographically weighted random forest regression model. Environmental Science and Pollution Research, pp.1-13.

Rahman, M.M., Ali, G.M.N., Li, X.J., Samuel, J., Paul, K.C., Chong, P.H. and Yakubov, M., 2021. Socioeconomic factors analysis for COVID-19 US reopening sentiment with Twitter and census data. Heliyon, 7(2).

Sarkar, S., Dauer, M.J. and In, H., 2022. Socioeconomic disparities in gastric cancer and identification of a single SES variable for predicting risk. Journal of gastrointestinal cancer, pp.1-9.

R STUDIO Training - 05.06.2025 - SKUD-1

2025-06-06

Library Import

overview

Data Exploration

Probability

P(nchild ≥ 1 | married)

Probability distribution of nchild

Mean and Variance of nchild

Estimates, CI & Hypothesis Test

For 5+ children

Contingency table and test

Chi-square test

Point Estimates, Confidence Intervals & Hypothesis Testing

Simple Linear Regression

Simple Linear Regression: log(wage) ~ age

Scatter plots with regression lines

Simple Linear Regression

Multiple Linear Regression

Full model for young

Full model for old

Optional: Reduced model explanation

Conclusion

References

R STUDIO Training - 05.06.2025 - SKUD-1

2025-06-06

Library Import

overview

Data Exploration

Probability

P(nchild ≥ 1 | married)

Probability distribution of nchild

Mean and Variance of nchild

Estimates, CI & Hypothesis Test

For 5+ children

Contingency table and test

Chi-square test

Point Estimates, Confidence Intervals & Hypothesis Testing

Simple Linear Regression

**Simple Linear Regression: log(wage) ~ age**

Scatter plots with regression lines

Simple Linear Regression

Multiple Linear Regression

Full model for young

Full model for old

Optional: Reduced model explanation

Conclusion

References

Simple Linear Regression: log(wage) ~ age