R Notebook: dplyr Practice by LiJing

This is an R Markdown Notebook. When you execute code within the notebook, the results appear beneath the code.

Dataset: incomedata.csv

Importing dataset & library

Checking dataset

# Show the first rows of the data frame
df %>% head()

# Show the structure: number of observations and the type of each column
df %>% str()

## 'data.frame':    700 obs. of  4 variables:
##  $ agecat: chr  "31-44" "Up to 30" "31-44" "31-44" ...
##  $ educ  : chr  "Some college" "Did not complete high school" "Did not complete high school" "Did not complete high school" ...
##  $ gender: chr  "Female" "Male" "Male" "Male" ...
##  $ income: int  176 31 55 120 28 25 67 38 19 25 ...

# Frequency of each distinct value in the age category column
table(df[["agecat"]])

## 
##    31-44  Over 44 Up to 30 
##      360       98      242

# Frequency of each distinct value in the education column
table(df[["educ"]])

## 
##               College degree Did not complete high school 
##                           38                          372 
##           High school degree    Post-undergraduate degree 
##                          198                            5 
##                 Some college 
##                           87

# Frequency of each distinct value in the gender column
table(df[["gender"]])

## 
## Female   Male 
##    183    517

Practice 1

Select the variables age category (agecat) and gender for the subjects who did not complete high school

# Keep only the subjects who did not complete high school, then retain just
# the two requested columns (agecat and gender). educ is needed for the
# filter but is not part of the requested output, so it is not selected.
df1 <- df %>%
  filter(educ == "Did not complete high school") %>%
  select(agecat, gender)

# Preview the first rows of the result
head(df1)

Practice 2

Select the variables age category and gender for the subjects who did not complete high school and have an income greater than 20

# Subjects without a high school diploma whose income exceeds 20,
# reduced to the age category and gender columns
df2 <- df %>%
  filter(
    educ == "Did not complete high school",
    income > 20
  ) %>%
  select(agecat, gender)

# Preview the first rows of the result
df2 %>% head()

Practice 3

Select the male subjects with an income lower than 65 and compute their average income

# Mean income of the male subjects whose income is below 65,
# returned as a one-row summary with the column avginc
average_income <- df %>%
  filter(gender == "Male" & income < 65) %>%
  summarise(avginc = mean(income))

# Display the result
average_income

Practice 4

Group the subjects by age category and compute the mean and the standard deviation of the income

# Per age category: mean (avgincome) and standard deviation (stdincome)
# of income
df4 <- df %>%
  group_by(agecat) %>%
  summarise(
    avgincome = mean(income),
    stdincome = sd(income)
  )

# Display the result
df4

Practice 5

Sort the data frame by income, in ascending and descending order

# Ascending: lowest incomes first (first rows only)
df %>%
  arrange(income) %>%
  head()

# Descending: highest incomes first (first rows only)
df %>%
  arrange(desc(income)) %>%
  head()

Practice 6

Count the number of subjects in each education category

# Number of subjects per education category
df %>% count(educ)

Practice 7

Select 150 subjects at random, with and without replacement

# With replacement: draw 150 rows; the same subject may be drawn more than once
slice_sample(df, n = 150, replace = TRUE)

# Without replacement: draw 150 rows; each subject can be drawn at most once
slice_sample(df, n = 150, replace = FALSE)

Practice 8

Select 25% of the subjects at random, with and without replacement

# With replacement: draw 25% of the rows; the same subject may be drawn
# more than once
slice_sample(df, prop = 0.25, replace = TRUE)

# Without replacement: draw 25% of the rows; each subject can be drawn
# at most once
slice_sample(df, prop = 0.25, replace = FALSE)