Eksplorasi data wooldridge

Author

agatha

eksplorasi data

library(wooldridge)
library(ggplot2)
# Load the `apple` data set from the wooldridge package, then show its
# compact structure (column names, types and first values).
data("apple", package = "wooldridge")
str(object = apple)
'data.frame':   660 obs. of  17 variables:
 $ id      : int  10002 10004 10034 10035 10039 10041 10046 10048 10050 10057 ...
 $ educ    : int  16 16 18 12 15 12 14 16 17 12 ...
 $ date    : chr  "111597" "121897" "111097" "111597" ...
 $ state   : chr  "SD" "KS" "MI" "TN" ...
 $ regprc  : num  1.19 0.59 0.59 0.89 0.89 ...
 $ ecoprc  : num  1.19 0.79 0.99 1.09 1.09 ...
 $ inseason: int  1 0 1 1 0 1 1 0 1 1 ...
 $ hhsize  : int  4 1 3 2 1 4 5 4 1 2 ...
 $ male    : int  0 0 0 1 1 0 0 0 1 0 ...
 $ faminc  : int  45 65 65 55 25 15 25 45 105 25 ...
 $ age     : int  43 37 44 55 22 34 72 38 50 69 ...
 $ reglbs  : num  2 0 0 3 0 0 0 0 0 0 ...
 $ ecolbs  : num  2 2 2.67 0 3 ...
 $ numlt5  : int  0 0 0 0 0 0 0 0 0 0 ...
 $ num5_17 : int  1 0 2 0 0 2 1 2 0 0 ...
 $ num18_64: int  3 1 1 2 1 2 2 2 1 0 ...
 $ numgt64 : int  0 0 0 0 0 0 2 0 0 2 ...
 - attr(*, "time.stamp")= chr "25 Jun 2011 23:03"
# Per-column summary: five-number summary plus mean for numeric columns,
# length/class/mode for character columns.
print(summary(object = apple))
       id             educ           date              state          
 Min.   :10002   Min.   : 8.00   Length:660         Length:660        
 1st Qu.:10800   1st Qu.:12.00   Class :character   Class :character  
 Median :11692   Median :14.00   Mode  :character   Mode  :character  
 Mean   :11729   Mean   :14.38                                        
 3rd Qu.:12600   3rd Qu.:16.00                                        
 Max.   :13921   Max.   :20.00                                        
     regprc           ecoprc         inseason          hhsize     
 Min.   :0.5900   Min.   :0.590   Min.   :0.0000   Min.   :1.000  
 1st Qu.:0.5900   1st Qu.:0.890   1st Qu.:0.0000   1st Qu.:2.000  
 Median :0.8900   Median :1.090   Median :0.0000   Median :3.000  
 Mean   :0.8827   Mean   :1.082   Mean   :0.3364   Mean   :2.941  
 3rd Qu.:1.1900   3rd Qu.:1.290   3rd Qu.:1.0000   3rd Qu.:4.000  
 Max.   :1.1900   Max.   :1.590   Max.   :1.0000   Max.   :9.000  
      male            faminc            age            reglbs      
 Min.   :0.0000   Min.   :  5.00   Min.   :19.00   Min.   : 0.000  
 1st Qu.:0.0000   1st Qu.: 25.00   1st Qu.:33.00   1st Qu.: 0.000  
 Median :0.0000   Median : 45.00   Median :43.00   Median : 0.000  
 Mean   :0.2621   Mean   : 53.41   Mean   :44.52   Mean   : 1.282  
 3rd Qu.:1.0000   3rd Qu.: 65.00   3rd Qu.:53.00   3rd Qu.: 2.000  
 Max.   :1.0000   Max.   :250.00   Max.   :88.00   Max.   :42.000  
     ecolbs           numlt5          num5_17          num18_64    
 Min.   : 0.000   Min.   :0.0000   Min.   :0.0000   Min.   :0.000  
 1st Qu.: 0.000   1st Qu.:0.0000   1st Qu.:0.0000   1st Qu.:1.000  
 Median : 1.000   Median :0.0000   Median :0.0000   Median :2.000  
 Mean   : 1.474   Mean   :0.2864   Mean   :0.6212   Mean   :1.805  
 3rd Qu.: 2.000   3rd Qu.:0.0000   3rd Qu.:1.0000   3rd Qu.:2.000  
 Max.   :42.000   Max.   :4.0000   Max.   :6.0000   Max.   :7.000  
    numgt64      
 Min.   :0.0000  
 1st Qu.:0.0000  
 Median :0.0000  
 Mean   :0.2288  
 3rd Qu.:0.0000  
 Max.   :3.0000  

sebaran data untuk dua variabel age dan id

# Scatter plot with age on the x axis and id on the y axis.
# NOTE(review): `id` looks like a respondent identifier rather than a
# measurement -- confirm this pairing is intended.
ggplot(data = apple, mapping = aes(x = age, y = id)) +
  geom_point() +
  ggtitle("Scatter Plot of Age and Id") +
  xlab("Age") +
  ylab("Id") +
  theme_minimal()

korelasi antara age dan ecoprc

# Correlation between age and ecoprc (cor()'s default method);
# use = "complete.obs" restricts the computation to rows without NA.
with(apple, cor(age, ecoprc, use = "complete.obs"))
[1] 0.07202745

menghitung jumlah nilai yang hilang

# Total number of NA cells in the data frame: count the NAs column by column
# (as integers) and add the per-column counts together.
sum(vapply(apple, function(column) sum(is.na(column)), integer(1)))
[1] 0

histogram dari age

# Histogram of age using one-unit-wide bins, purple bars with black outlines.
ggplot(data = apple, mapping = aes(x = age)) +
  geom_histogram(binwidth = 1, colour = "black", fill = "purple") +
  ggtitle("Histogram of Age") +
  xlab("Age") +
  ylab("Frequency") +
  theme_minimal()

boxplot dari ecoprc

# Single box plot summarising the distribution of ecoprc.
ggplot(data = apple, mapping = aes(y = ecoprc)) +
  geom_boxplot() +
  ggtitle("Boxplot of Ecoprc") +
  ylab("Ecoprc") +
  theme_minimal()

boxplot dari ecoprc berdasarkan faminc

# Box plots of ecoprc, one box per distinct faminc value.
# faminc is stored as an integer column; mapping it to x unchanged makes
# geom_boxplot() raise the "Continuous x aesthetic" warning, so it is
# converted to a factor to define the groups explicitly.
ggplot(apple, aes(x = factor(faminc), y = ecoprc)) +
  geom_boxplot() +
  labs(x = "Faminc", y = "Ecoprc",
       title = "Boxplot of Ecoprc by Faminc") +
  theme_minimal()

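ANOVA dari ecoprc berdasarkan faminc. Jika nilai p-value kurang dari 0.05, berarti terdapat perbedaan yang signifikan. Pada hasil di bawah, p-value = 0.162 (lebih besar dari 0.05), sehingga tidak terdapat perbedaan yang signifikan. Catatan: faminc bertipe numerik, sehingga model ini hanya memiliki 1 derajat bebas untuk faminc (menguji tren linear, bukan perbedaan antarkelompok).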

# Fit ecoprc on faminc with aov() and print the resulting ANOVA table.
# NOTE(review): faminc is an integer column, so it enters the model as a
# single numeric term (1 Df in the table). Wrap it in factor() if a comparison
# of group means across income levels is what is intended -- confirm, then
# re-render so the printed table matches.
anova_result <- aov(ecoprc ~ faminc, data = apple)
summary(anova_result)
             Df Sum Sq Mean Sq F value Pr(>F)
faminc        1   0.17 0.17100    1.96  0.162
Residuals   658  57.40 0.08724               

analisis regresi

# Linear regression of ecoprc on hhsize, faminc and age; the fitted model is
# kept in `reg_model` and its coefficient table is printed.
# The formula is written inline because summary() echoes the call verbatim.
reg_model <- lm(ecoprc ~ hhsize + faminc + age, data = apple)
summary(reg_model)

Call:
lm(formula = ecoprc ~ hhsize + faminc + age, data = apple)

Residuals:
     Min       1Q   Median       3Q      Max 
-0.53393 -0.22334  0.00495  0.22194  0.60577 

Coefficients:
              Estimate Std. Error t value Pr(>|t|)    
(Intercept)  1.0392065  0.0525410  19.779   <2e-16 ***
hhsize       0.0012237  0.0079859   0.153    0.878    
faminc      -0.0004093  0.0003240  -1.263    0.207    
age          0.0013604  0.0007988   1.703    0.089 .  
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 0.2951 on 656 degrees of freedom
Multiple R-squared:  0.007603,  Adjusted R-squared:  0.003065 
F-statistic: 1.675 on 3 and 656 DF,  p-value: 0.171

apakah perubahan dalam hhsize memengaruhi ecoprc? (pada regresi di atas, koefisien hhsize tidak signifikan: p-value = 0.878)

# Scatter plot of hhsize against ecoprc with a fitted straight line on top.
# The smoothing formula is spelled out so geom_smooth() no longer has to
# report which default formula it picked, and the full argument name
# `colour` replaces the abbreviated `col`.
ggplot(apple, aes(x = hhsize, y = ecoprc)) +
  geom_point() +
  geom_smooth(method = "lm", formula = y ~ x, colour = "pink") +
  labs(x = "hhsize", y = "Ecoprc",
       title = "Scatter Plot of hhsize and Ecoprc with Regression Line") +
  theme_minimal()

histogram dari ecoprc

# Histogram of ecoprc on the density scale with a kernel density curve
# drawn over it.
# after_stat(density) and linewidth replace the `..density..` notation and
# the `size` argument for lines, both deprecated since ggplot2 3.4.0.
ggplot(apple, aes(x = ecoprc)) +
  geom_histogram(aes(y = after_stat(density)), binwidth = 0.1,
                 fill = "blue", color = "black") +
  geom_density(color = "red", linewidth = 1) +
  labs(x = "Ecoprc", y = "Density",
       title = "Histogram dan Density Plot dari Ecoprc") +
  theme_minimal()

q-q plot dari ecoprc

# Quantile-quantile plot of ecoprc with its reference line.
ggplot(data = apple, mapping = aes(sample = ecoprc)) +
  geom_qq() +
  geom_qq_line() +
  ggtitle("Q-Q Plot dari Ecoprc") +
  xlab("Theoretical Quantiles") +
  ylab("Sample Quantiles") +
  theme_minimal()

uji normalitas Shapiro-Wilk untuk ecoprc (p-value kurang dari 0.05 berarti data tidak berdistribusi normal)

# Shapiro-Wilk normality test on ecoprc; the result is stored in
# `shapiro_test` and then printed.
# The argument stays written as `apple$ecoprc` because the printed "data:"
# line echoes that expression.
shapiro_test <- shapiro.test(apple$ecoprc)
shapiro_test

    Shapiro-Wilk normality test

data:  apple$ecoprc
W = 0.94939, p-value = 2.983e-14

menghitung rata-rata dari ecoprc

# Arithmetic mean of ecoprc; na.rm = TRUE skips missing values if present.
mean_ecoprc <- mean(apple[["ecoprc"]], na.rm = TRUE)
print(mean_ecoprc)
[1] 1.081515

menghitung varians

# Sample variance of ecoprc; na.rm = TRUE skips missing values if present.
var_ecoprc <- var(apple[["ecoprc"]], na.rm = TRUE)
print(var_ecoprc)
[1] 0.08736341

menghitung standar deviasi

# Sample standard deviation of ecoprc; na.rm = TRUE skips missing values if
# present.
sd_ecoprc <- sd(apple[["ecoprc"]], na.rm = TRUE)
print(sd_ecoprc)
[1] 0.295573