Reading dataset

Before reading a dataset from a folder we need to make sure the dataset is in the working directory. We can set working directory using different methods:

Session > Set Working Directory …..
Typing command
Files tab in the Environment Pane

setwd("D:/Google Drive/11 R_Tutorial_PSTU/R Course MS PhD")

data = readxl::read_excel('DataSets.xlsx', sheet = 'wrangling', range = 'B7:N507')
head(data)

## # A tibble: 6 × 13
##     sr.   jan   feb   mar   apr   may   jun   jul   aug   sep   oct   nov   dec
##   <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1     1     0     1     1     0     0     0     0     0     0     0     0     0
## 2     2     0     1     1     1     0     0     0     0     0     0     0     0
## 3     3     1     0     1     0     1     0     0     0     0     0     0     0
## 4     4     0     0     1     1     0     1     0     0     1     0     0     0
## 5     5     1     0     0     0     0     0     0     0     0     1     0     0
## 6     6     0     1     0     0     0     0     0     0     0     0     0     0

tail(data)

## # A tibble: 6 × 13
##     sr.   jan   feb   mar   apr   may   jun   jul   aug   sep   oct   nov   dec
##   <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1   495     0     1     0     1     0     0     1     0     0     0     0     0
## 2   496     0     1     1     0     0     0     0     0     0     0     0     0
## 3   497     1     0     1     1     1     0     0     0     0     0     0     0
## 4   498     0     0     0     1     1     0     1     0     0     0     0     0
## 5   499     1     0     0     0     0     0     0     0     0     0     0     1
## 6   500     0     0     0     1     0     0     0     0     0     0     0     0

Use shortcut to insert R command: ctrl+alt+I

str(data)

## tibble [500 × 13] (S3: tbl_df/tbl/data.frame)
##  $ sr.: num [1:500] 1 2 3 4 5 6 7 8 9 10 ...
##  $ jan: num [1:500] 0 0 1 0 1 0 0 0 1 0 ...
##  $ feb: num [1:500] 1 1 0 0 0 1 1 0 0 0 ...
##  $ mar: num [1:500] 1 1 1 1 0 0 0 0 1 1 ...
##  $ apr: num [1:500] 0 1 0 1 0 0 1 0 0 1 ...
##  $ may: num [1:500] 0 0 1 0 0 0 1 0 1 0 ...
##  $ jun: num [1:500] 0 0 0 1 0 0 0 0 1 1 ...
##  $ jul: num [1:500] 0 0 0 0 0 0 0 0 0 0 ...
##  $ aug: num [1:500] 0 0 0 0 0 0 0 1 0 0 ...
##  $ sep: num [1:500] 0 0 0 1 0 0 0 0 0 0 ...
##  $ oct: num [1:500] 0 0 0 0 1 0 0 0 0 0 ...
##  $ nov: num [1:500] 0 0 0 0 0 0 0 1 0 0 ...
##  $ dec: num [1:500] 0 0 0 0 0 0 0 0 0 0 ...

summary(data)

##       sr.             jan             feb             mar       
##  Min.   :  1.0   Min.   :0.000   Min.   :0.000   Min.   :0.000  
##  1st Qu.:125.8   1st Qu.:0.000   1st Qu.:0.000   1st Qu.:0.000  
##  Median :250.5   Median :0.000   Median :0.000   Median :0.000  
##  Mean   :250.5   Mean   :0.286   Mean   :0.354   Mean   :0.392  
##  3rd Qu.:375.2   3rd Qu.:1.000   3rd Qu.:1.000   3rd Qu.:1.000  
##  Max.   :500.0   Max.   :1.000   Max.   :1.000   Max.   :1.000  
##       apr             may             jun             jul            aug      
##  Min.   :0.000   Min.   :0.000   Min.   :0.000   Min.   :0.00   Min.   :0.00  
##  1st Qu.:0.000   1st Qu.:0.000   1st Qu.:0.000   1st Qu.:0.00   1st Qu.:0.00  
##  Median :0.000   Median :0.000   Median :0.000   Median :0.00   Median :0.00  
##  Mean   :0.358   Mean   :0.224   Mean   :0.218   Mean   :0.18   Mean   :0.09  
##  3rd Qu.:1.000   3rd Qu.:0.000   3rd Qu.:0.000   3rd Qu.:0.00   3rd Qu.:0.00  
##  Max.   :1.000   Max.   :1.000   Max.   :1.000   Max.   :1.00   Max.   :1.00  
##       sep             oct            nov             dec       
##  Min.   :0.000   Min.   :0.00   Min.   :0.000   Min.   :0.000  
##  1st Qu.:0.000   1st Qu.:0.00   1st Qu.:0.000   1st Qu.:0.000  
##  Median :0.000   Median :0.00   Median :0.000   Median :0.000  
##  Mean   :0.082   Mean   :0.08   Mean   :0.076   Mean   :0.042  
##  3rd Qu.:0.000   3rd Qu.:0.00   3rd Qu.:0.000   3rd Qu.:0.000  
##  Max.   :1.000   Max.   :1.00   Max.   :1.000   Max.   :1.000

We will subset the dataset by excluding the Sr. variable (1st variable)

data1 = data[ , -1]
head(data1)

## # A tibble: 6 × 12
##     jan   feb   mar   apr   may   jun   jul   aug   sep   oct   nov   dec
##   <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1     0     1     1     0     0     0     0     0     0     0     0     0
## 2     0     1     1     1     0     0     0     0     0     0     0     0
## 3     1     0     1     0     1     0     0     0     0     0     0     0
## 4     0     0     1     1     0     1     0     0     1     0     0     0
## 5     1     0     0     0     0     0     0     0     0     1     0     0
## 6     0     1     0     0     0     0     0     0     0     0     0     0

Calculate the column frequencies (summation) using colSums()

colSums(data1)

## jan feb mar apr may jun jul aug sep oct nov dec 
## 143 177 196 179 112 109  90  45  41  40  38  21

Create an object to store the value of summation.

sums = as.data.frame(colSums(data1))
str(sums)

## 'data.frame':    12 obs. of  1 variable:
##  $ colSums(data1): num  143 177 196 179 112 109 90 45 41 40 ...

rownames(sums)

##  [1] "jan" "feb" "mar" "apr" "may" "jun" "jul" "aug" "sep" "oct" "nov" "dec"

Piping operation: %>% (ctrl+shif+M). It uses the previous output as an input for the next calculation.

library(tidyverse)

## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ ggplot2   3.5.1     ✔ tibble    3.2.1
## ✔ lubridate 1.9.4     ✔ tidyr     1.3.1
## ✔ purrr     1.0.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors

sums1 = sums %>% rownames_to_column('Months') 
sums1

##    Months colSums(data1)
## 1     jan            143
## 2     feb            177
## 3     mar            196
## 4     apr            179
## 5     may            112
## 6     jun            109
## 7     jul             90
## 8     aug             45
## 9     sep             41
## 10    oct             40
## 11    nov             38
## 12    dec             21

sums2 = sums1 %>% rename(Frequency = `colSums(data1)`)
sums2

##    Months Frequency
## 1     jan       143
## 2     feb       177
## 3     mar       196
## 4     apr       179
## 5     may       112
## 6     jun       109
## 7     jul        90
## 8     aug        45
## 9     sep        41
## 10    oct        40
## 11    nov        38
## 12    dec        21

# Change order of the x-axis items
sums2$Months = factor(sums2$Months, 
                      levels = c('jan', 'feb', 'mar', 'apr', 'may', 'jun', 'jul', 'aug', 'sep', 'oct', 'nov', 'dec' ), 
                      labels = c('Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec' ))

ggplot(sums2)+
  aes(x = Months, y = Frequency, color = Months, fill = Months)+
  geom_col()+
  theme_bw()+
  labs(title = 'Months of money shortage',
       x = '',
       y = 'Number of respondents')+
  theme(legend.position = '')

# ggsave('My plot.png', dpi = 600, height = 4, width = 8, units = 'in')

Categorize the respondents based on their number of months having money shortage.

shortage = as.data.frame(rowSums(data1))
short = shortage %>% rename(frequency = `rowSums(data1)`)
str(short)

## 'data.frame':    500 obs. of  1 variable:
##  $ frequency: num  2 3 3 4 2 1 3 2 4 3 ...

head(short)

##   frequency
## 1         2
## 2         3
## 3         3
## 4         4
## 5         2
## 6         1

Some descriptives of this frequency variable.

summary(short)

##    frequency    
##  Min.   :0.000  
##  1st Qu.:1.000  
##  Median :2.000  
##  Mean   :2.382  
##  3rd Qu.:3.000  
##  Max.   :9.000

To see the frequency, we need categorical (factor) variable. Convert the numeric variable to factor variable.

short$frequency = as.factor(short$frequency)
summary(short)

##    frequency  
##  2      :141  
##  3      :113  
##  1      :110  
##  4      : 69  
##  0      : 34  
##  5      : 21  
##  (Other): 12

Categorize the respondents based on their number of months with money shortage:

Two principles: Mutually exclusive, Completely exhaustive

Low = up to 3 months of money shortage

Medium = 4 to 5 months of money shortage

High = >5 months of money shortage

For categorization we need numeric (continuous or discrete) variables.

Right = TRUE, means right value closed, right value is included in the category

short$frequency = as.numeric(short$frequency)
category = cut(short$frequency, breaks = c(-Inf, 3, 5, Inf), 
    labels = c('Low', 'Medium', 'High'),
    right = TRUE)
table(category)

## category
##    Low Medium   High 
##    285    182     33

# calculate the percentage
table(category)*100/500

## category
##    Low Medium   High 
##   57.0   36.4    6.6

R markdown, Basic functions, Package (tidyverse) installation, Visualization using ggplot()

Md Kamrul Hasan

Reading dataset