# Load packages ----
# Attaches the core tidyverse packages used below (read_csv, glimpse).
# Lines starting with `#>` record the startup message printed when the script
# was run; they are kept as comments so the file remains parseable R.
library(tidyverse)
#> ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
#> ✔ dplyr 1.1.4 ✔ readr 2.1.5
#> ✔ forcats 1.0.1 ✔ stringr 1.5.2
#> ✔ ggplot2 4.0.0 ✔ tibble 3.3.0
#> ✔ lubridate 1.9.4 ✔ tidyr 1.3.1
#> ✔ purrr 1.1.0
#> ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
#> ✖ dplyr::filter() masks stats::filter()
#> ✖ dplyr::lag() masks stats::lag()
#> ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
# Read the data ----
# Reads data.csv into `df`; column types are guessed by readr (see the
# column specification recorded below: 3 character and 6 double columns).
# NOTE(review): the path is absolute and specific to one user's machine, so
# the script will not run elsewhere as-is — consider a project-relative path.
# Lines starting with `#>` record the message printed when the script was run.
df <- read_csv(
  "C:/Users/grego/OneDrive/Desktop/MSc Artificial Intelligence/Data Science/CourseWork_7CSO30/data.csv"
)
#> Rows: 27410 Columns: 9
#> ── Column specification ────────────────────────────────────────────────────────
#> Delimiter: ","
#> chr (3): Mar_Stat, Eth, Highest Ed
#> dbl (6): ID, Person_ID, Age, INC, Female, H8
#> ℹ Use `spec()` to retrieve the full column specification for this data.
#> ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Inspect the structure ----
# Prints one line per column of `df`: name, type and the first few values.
# Lines starting with `#>` record the output printed when the script was run.
glimpse(df)
#> Rows: 27,410
#> Columns: 9
#> $ ID <dbl> 37, 37, 37, 241, 242, 377, 418, 465, 465, 484, 484, 484, …
#> $ Person_ID <dbl> 1, 2, 3, 1, 1, 1, 1, 1, 2, 1, 2, 3, 4, 1, 2, 3, 4, 5, 1, …
#> $ Age <dbl> 20, 19, 19, 50, 29, 69, 59, 55, 47, 33, 26, 4, 2, 39, 36,…
#> $ Mar_Stat <chr> "Never married", "Never married", "Never married", "Never…
#> $ INC <dbl> 10000, 5300, 4700, 32500, 30000, 51900, 12200, 0, 2600, 1…
#> $ Female <dbl> 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 0, 1, 0, 1, 1, 0, 1, 0, …
#> $ H8 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
#> $ Eth <chr> "White", "White", "Black", "White", "White", "White", "Wh…
#> $ `Highest Ed` <chr> "Some HE", "Some HE", "Some HE", "Masters or higher", "Ba…
# Preview the first 10 rows ----
# Lines starting with `#>` record the output printed when the script was run
# (column alignment of the printed table was lost in the capture).
head(df, 10)
#> # A tibble: 10 × 9
#> ID Person_ID Age Mar_Stat INC Female H8 Eth `Highest Ed`
#> <dbl> <dbl> <dbl> <chr> <dbl> <dbl> <dbl> <chr> <chr>
#> 1 37 1 20 Never married 10000 1 0 White Some HE
#> 2 37 2 19 Never married 5300 1 0 White Some HE
#> 3 37 3 19 Never married 4700 1 0 Black Some HE
#> 4 241 1 50 Never married 32500 1 0 White Masters or h…
#> 5 242 1 29 Never married 30000 1 0 White Bachelors De…
#> 6 377 1 69 Never married 51900 1 0 White Less than Se…
#> 7 418 1 59 Widowed 12200 1 0 White Less than Se…
#> 8 465 1 55 Separated 0 0 0 Black Some HE
#> 9 465 2 47 Never married 2600 1 0 Black Less than Se…
#> 10 484 1 33 married 16800 0 0 Hispanic Secondary Sc…
# Summarise every column ----
# summary() reports min/quartiles/mean/max for the numeric columns and only
# length/class/mode for the character columns (Mar_Stat, Eth, Highest Ed).
# Lines starting with `#>` record the output printed when the script was run.
# NOTE(review): column alignment was lost in the capture, so the column that
# owns "NA's :6173" cannot be read off the text — presumably INC; confirm.
summary(df)
#> ID Person_ID Age Mar_Stat
#> Min. : 37 Min. : 1.000 Min. : 0.00 Length:27410
#> 1st Qu.: 324129 1st Qu.: 1.000 1st Qu.:16.00 Class :character
#> Median : 617477 Median : 2.000 Median :35.00 Mode :character
#> Mean : 624437 Mean : 2.242 Mean :35.67
#> 3rd Qu.: 938244 3rd Qu.: 3.000 3rd Qu.:51.00
#> Max. :1236779 Max. :16.000 Max. :93.00
#> INC Female H8 Eth
#> Min. : 0 Min. :0.0000 Min. :0.0000 Length:27410
#> 1st Qu.: 6000 1st Qu.:0.0000 1st Qu.:0.0000 Class :character
#> Median : 18000 Median :1.0000 Median :0.0000 Mode :character
#> Mean : 27766 Mean :0.5138 Mean :0.1294
#> 3rd Qu.: 35900 3rd Qu.:1.0000 3rd Qu.:0.0000
#> Max. :720000 Max. :1.0000 Max. :1.0000
#> NA's :6173
#> Highest Ed
#> Length:27410
#> Class :character
#> Mode :character