HW-2

Clean Data - Navy

Felix Betancourt
2022-06-13

Setting up and Reading the data

First let’s load the dplyr package and set the working directory.

# Load dplyr, which provides select() used further down

library(dplyr)

# Point the working directory at the homework folder so the data file can be
# read by its bare file name.
# NOTE(review): a hard-coded absolute path only works on this machine;
# consider a project-relative path instead of setwd().

setwd("/Users/fbeta/OneDrive/1-UMASS-DACSS/601/Week-4/HW-2")

# Confirm the working directory

print(getwd())
[1] "C:/Users/fbeta/OneDrive/1-UMASS-DACSS/601/Week-4/HW-2"

Load the data

# Read the cleaned Navy CSV from the working directory into a data frame

navy <- read.csv(file = "navy_cleaned-navy_cleaned.csv")

Exploring the data

# Preview the first six rows of the data

head(navy, n = 6)
  enlisted pay_grade single.withoutchildren.male
1        E         1                        7820
2        E         2                       11198
3        E         3                       28163
4        E         4                       23285
5        E         5                       18856
6        E         6                        5917
  single.withoutchildren.female single.withoutchildren.total
1                          2275                        10095
2                          2718                        13916
3                          6396                        34559
4                          4266                        27551
5                          3649                        22505
6                          1429                         7346
  single.withchildren.male single.withchildren.female
1                      117                         34
2                      237                         87
3                      681                        788
4                      967                        949
5                     2837                       1787
6                     2725                       1284
  single.withchildren.total married.jointservice.male
1                       151                        30
2                       324                       113
3                      1469                       861
4                      1916                      1526
5                      4624                      2191
6                      4009                      1654
  married.jointservice.female married.jointservice.total
1                          57                         87
2                         164                        277
3                        1288                       2149
4                        1691                       3217
5                        2099                       4290
6                        1461                       3115
  married.civilian.female married.civilian.male
1                     806                   162
2                    2474                   388
3                   11297                  1791
4                   16277                  1768
5                   31491                  2272
6                   30404                  1561
  married.civilian.total married.male.total married.female.total
1                    968               8773                 2528
2                   2862              14022                 3357
3                  13088              41002                10263
4                  18045              42055                 8674
5                  33763              55375                 9807
6                  31965              40700                 5735
  married.total.total branch
1               11301   Navy
2               17379   Navy
3               51265   Navy
4               50729   Navy
5               65182   Navy
6               46435   Navy

Looking at the data, it seems that we have three main types of variables:

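  1. Categorical: enlisted, branch
  2. Ordinal: pay_grade
  3. Discrete numerical: the personnel counts (e.g. single.withoutchildren.male, married.total.total)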

However let’s check how R is reading the data.

# Check how R stored each column. Pass the column itself (navy$col), not its
# quoted name: a quoted name is just a string, so is.numeric() is always FALSE
# and is.character() is always TRUE for it, whatever the data looks like.
is.numeric(navy$single.withoutchildren.male)
[1] TRUE
is.numeric(navy$enlisted)
[1] FALSE
is.numeric(navy$pay_grade)
[1] TRUE
is.character(navy$single.withoutchildren.male)
[1] FALSE
is.character(navy$enlisted)
[1] TRUE
is.character(navy$pay_grade)
[1] FALSE

It seems that the count variables and pay_grade are being read as numeric, and only enlisted is being read as character.

If a discrete numeric variable had been read as character, this is how we would convert it to numeric:

# Convert a column to numeric values. as.character() comes first so that a
# factor would be converted through its labels, not its internal integer codes.
swochm2 <- as.numeric(as.character(navy$single.withoutchildren.male))

# check if it is numeric value
is.numeric(swochm2)
[1] TRUE

Let’s explore the data a bit more:

# Number of rows and columns in the navy data frame
dim(x = navy)
[1] 23 18

The table has 23 rows and 18 columns (variables).

# List the variable names of the navy data frame
names(navy)
 [1] "enlisted"                      "pay_grade"                    
 [3] "single.withoutchildren.male"   "single.withoutchildren.female"
 [5] "single.withoutchildren.total"  "single.withchildren.male"     
 [7] "single.withchildren.female"    "single.withchildren.total"    
 [9] "married.jointservice.male"     "married.jointservice.female"  
[11] "married.jointservice.total"    "married.civilian.female"      
[13] "married.civilian.male"         "married.civilian.total"       
[15] "married.male.total"            "married.female.total"         
[17] "married.total.total"           "branch"                       

Now let’s create some tables

Select specific columns

# Keep only the enlisted column, selected by name
select(navy, enlisted)
   enlisted
1         E
2         E
3         E
4         E
5         E
6         E
7         E
8         E
9         E
10        O
11        O
12        O
13        O
14        O
15        O
16        O
17        O
18        O
19        O
20        W
21        W
22        W
23        W

And create a table with specific variables

# Cross-tabulate pay_grade (rows) against enlisted (columns)
navy %>%
  select(pay_grade, enlisted) %>%
  table()
         enlisted
pay_grade E O W
       1  1 1 0
       2  1 1 1
       3  1 1 1
       4  1 1 1
       5  1 1 1
       6  1 1 0
       7  1 1 0
       8  1 1 0
       9  1 1 0
       10 0 1 0