## title: Introduction to R
## author: Professor Dr. Md. Professor Dr. Md. Kamrul Hasanl Hasan
# This is a comment starting with a hash (#) symbol.
# Comments are not executed by R, they are just for human readers.
# R is a programming language and environment for statistical computing and graphics.
# It is a free software package that runs on a wide variety of UNIX platforms, Windows and MacOS.
# R is a powerful tool for data analysis and visualization.
# R is an interpreted language, which means that you can type commands directly into the console and get immediate results.
# You can also write scripts in a text editor and run them in R.
# Writing scripts is useful for several reasons:
# 1. Scripts can be saved and reused later.
# 2. Scripts can be shared with others.
# In this tutorial, we will learn how to write scripts in R.
# R script
# Open a directory or folder where you want to save your R script.
# Your datasets should also be saved in the same directory.
# Create a new text file and save it with a .R extension.
# If you do not see the file extension, make sure to enable file extensions in your operating system settings.
# Go to the folder options and uncheck the box that says "Hide extensions for known file types" under View tab.
# If you have saved the file with .txt extension by mistake, you can rename it to .R extension.
# Open the created .R file (R Script) with RStudio.
# Now you are ready for scripting in R.
# More conveniently, you can open a new R script in RStudio by going to File -> New File -> R Script.
# Save this R script to your desired folder (working directory) by going to File -> Save As.
# Working directory: This is the folder that stores scripts and datasets.
# You can check the current working directory using the getwd() function.
# Do not forget to run by hitting Ctrl+Enter or clicking the Run button.
# You can select a particular part in a line and run it by hitting Ctrl+Enter.
getwd()
## [1] "D:/Google Drive/11 R_Tutorial_PSTU/Machine Learning/Data and scripts"
# See the output in the console.
# This is a R function: function name always followed by parentheses. Here, getwd is the function name.
# The brackets must match, so if you open a bracket, you must close it.
# Three types of brackets are used in R: (elements), {statements}, and [indices].
# Files tab in the Environment pane shows the files in the current working directory.
# You can browse the desired folder in any location by clicking the "..." button in the Files tab.
# You can set the opened folder as the working directory by clicking the "More" button (gear icon) and selecting "Set As Working Directory".
# You can also set the working directory by going to Session -> Set Working Directory -> Choose Directory.
# If you want to set the working directory to the folder containing the current script, you can use the go to Session -> Set Working Directory -> Source File Location.
# To clear the environment
# rm(list = ls())
# To clear the console
# Ctrl + L
# Pipe operator %>%
# Ctrl + Shift + M
# Comment out
# Ctrl + Shift + C
# Set working directory
# setwd('directory path ....')
# Operators in R
# Mathematical operators: +, -, *, /, ^ (exponentiation)
2 + 5 # Addition
## [1] 7
5 - 2 # Subtraction
## [1] 3
2 * 5 # Multiplication
## [1] 10
5 / 2 # Division
## [1] 2.5
5^2 # Exponentiation
## [1] 25
# Logical operators: <, >, <=, >=, ==, != (check a statement is true or false)
5 > 2 # Greater than
## [1] TRUE
5 < 2 # Less than
## [1] FALSE
5 >= 2 # Greater than or equal to
## [1] TRUE
5 <= 2 # Less than or equal to
## [1] FALSE
5 == 2 # Equal to
## [1] FALSE
5 != 2 # Not equal to
## [1] TRUE
# Modulus operator: %% (remainder of the division), %/% (quotient of the division)
5 %% 2 # Remainder of the division
## [1] 1
5 %/% 2 # Quotient of the division
## [1] 2
# Assignment operator: <- or = (assign a value to a variable)
# Creating a variable or object named x and assigning the value 5 to it.
x <- 5 # Assign 5 to x
x = 5 # Assign 5 to x
x # Print the value of x
## [1] 5
y = 2
y # Print the value of y
## [1] 2
x - y # Subtract y from x
## [1] 3
x*y # Multiply x by y
## [1] 10
# R is case-sensitive, so x and X are different variables.
X = 10
X # Print the value of X
## [1] 10
# Check whether x and X are the same
x == X
## [1] FALSE
x != X
## [1] TRUE
# You can also assign the result of an operation to a variable.
z = x + y
z # Print the value of z
## [1] 7
# Trigonometric functions: sin(), cos(), tan(), asin(), acos(), atan()
tan(45) # Tangent of 45 radian, R function uses radian as the default unit
## [1] 1.619775
# Convert 45 radian to degree
deg = 45
rad = deg*pi/180
rad
## [1] 0.7853982
# Check the tangent this degree
tan(rad)
## [1] 1
# Logarithmic functions: log(), log10(), exp()
log(100) # Natural logarithm of 10
## [1] 4.60517
log10(1000) # Base 10 logarithm of 10
## [1] 3
exp(2) # Exponential function, e^2
## [1] 7.389056
exp(log(100)) # Anti natural log
## [1] 100
10^(log10(1000)) # Anti 10-base log
## [1] 1000
# Square root function: sqrt()
sqrt(25) # Square root of 25
## [1] 5
# Rounding functions: round(), floor(), ceiling()
pi
## [1] 3.141593
round(pi, 2) # Round pi to 2 decimal places
## [1] 3.14
floor(pi) # Round down to the nearest integer
## [1] 3
ceiling(pi) # Round up to the nearest integer
## [1] 4
# Absolute value function: abs()
abs(-5) # Absolute value of -5
## [1] 5
# set.seed() function is used to set the seed for random number generation.
# This is useful for reproducibility of results, i.e. same random numbers are generated each time.
# Random number generation: runif(), rnorm()
set.seed(123) # Set the seed for random number generation
runif(5) # Generate 5 random numbers from a uniform distribution [events which are equally likely]
## [1] 0.2875775 0.7883051 0.4089769 0.8830174 0.9404673
rnorm(5) # Generate 5 random numbers from a normal distribution [bell-shaped curve]
## [1] -1.6895557 1.2394959 -0.1089660 -0.1172420 0.1830826
# Help function: help()
# help(rnorm) # Get help on the rnorm function
# ?rnorm # shortcut for help(rnorm)
# c(), seq(), rep(), sample() functions
# c() function is used to combine values into a vector.
# seq() function is used to create a sequence of numbers.
# rep() function is used to repeat values.
# sample() function is used to randomly sample values.
c(1, 2, 3, 4, 5) # Combine values into a vector
## [1] 1 2 3 4 5
c(1:10) # Combine values from 1 to 10 into a vector
## [1] 1 2 3 4 5 6 7 8 9 10
seq(1, 10, by = 2) # Create a sequence from 1 to 10 with a step of 2
## [1] 1 3 5 7 9
rep(1, 5) # Repeat 1 five times
## [1] 1 1 1 1 1
sample(1:10, 5) # Randomly sample 5 values from 1 to 10
## [1] 5 3 9 1 4
# R has many built-in datasets that can be used for practice.
# You can see the list of built-in datasets using the data() function.
# You can load a dataset using the data(dataset name) function.
data(cars) # Load the cars dataset
head(cars) # Display the first 6 rows of the dataset
## speed dist
## 1 4 2
## 2 4 10
## 3 7 4
## 4 7 22
## 5 8 16
## 6 9 10
tail(cars) # Display the last 6 rows of the dataset
## speed dist
## 45 23 54
## 46 24 70
## 47 24 92
## 48 24 93
## 49 24 120
## 50 25 85
head(cars, 10)
## speed dist
## 1 4 2
## 2 4 10
## 3 7 4
## 4 7 22
## 5 8 16
## 6 9 10
## 7 10 18
## 8 10 26
## 9 10 34
## 10 11 17
# You can see the whole dataset from the environment pane by clicking the dataset name.
# Let us create a dataset with the data.frame() function.
# This dataset has five columns (variables): serial, age (normally distributed with 45 mean and 3 SD), family, income, land, and 20 rows (observations).
# The dataset is stored in the object named mydata.
# First, create the variables.
serial = 1:20
serial
## [1] 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20
set.seed(123) # Set the seed for random number generation
(age = rnorm(20, mean = 45, sd = 3)) # Enclose with brackets to save and print the values
## [1] 43.31857 44.30947 49.67612 45.21153 45.38786 50.14519 46.38275 41.20482
## [9] 42.93944 43.66301 48.67225 46.07944 46.20231 45.33205 43.33248 50.36074
## [17] 46.49355 39.10015 47.10407 43.58163
(family = sample(1:5, 20, replace = TRUE))
## [1] 2 5 1 1 2 3 4 5 5 3 1 2 5 5 4 5 2 1 1 3
(income = runif(20, min = 1000, max = 5000))
## [1] 2759.327 4017.901 3516.885 3840.730 1002.499 2901.266 1880.476 2519.266
## [9] 3451.084 2407.192 1444.542 1974.478 3672.222 2670.587 4152.783 1411.459
## [17] 2739.571 4939.828 4572.204 4545.876
(land = sample(c("urban", "rural"), 20, replace = TRUE))
## [1] "urban" "rural" "rural" "urban" "rural" "rural" "urban" "urban" "rural"
## [10] "urban" "urban" "urban" "rural" "rural" "urban" "rural" "rural" "urban"
## [19] "urban" "urban"
# Combine the variables into a dataset using the data.frame() function.
mydata = data.frame(serial, age, family, income, land)
mydata
## serial age family income land
## 1 1 43.31857 2 2759.327 urban
## 2 2 44.30947 5 4017.901 rural
## 3 3 49.67612 1 3516.885 rural
## 4 4 45.21153 1 3840.730 urban
## 5 5 45.38786 2 1002.499 rural
## 6 6 50.14519 3 2901.266 rural
## 7 7 46.38275 4 1880.476 urban
## 8 8 41.20482 5 2519.266 urban
## 9 9 42.93944 5 3451.084 rural
## 10 10 43.66301 3 2407.192 urban
## 11 11 48.67225 1 1444.542 urban
## 12 12 46.07944 2 1974.478 urban
## 13 13 46.20231 5 3672.222 rural
## 14 14 45.33205 5 2670.587 rural
## 15 15 43.33248 4 4152.783 urban
## 16 16 50.36074 5 1411.459 rural
## 17 17 46.49355 2 2739.571 rural
## 18 18 39.10015 1 4939.828 urban
## 19 19 47.10407 1 4572.204 urban
## 20 20 43.58163 3 4545.876 urban
# Save this dataset using .csv, .txt, or .xlsx file formats.
write.csv(mydata, "mydata.csv") # Save the dataset as a .csv file
write.table(mydata, "mydata.txt") # Save the dataset as a .txt file
# write.xlsx(mydata, "mydata.xlsx") # Save the dataset as a .xlsx file
# Could not find the function write.xlsx error occurs because the write.xlsx() function is not available in base R. You need to install the openxlsx package to use this function.
# You can install the openxlsx package using the install.packages("openxlsx") command.
# You can install packages using the Packages tab in the lower right pane of RStudio.
# You can also install packages using the Tools -> Install Packages... menu in RStudio.
# after installation, you must load the package or library using the library(openxlsx) command.
library(openxlsx) # Load the openxlsx package
write.xlsx(mydata, "mydata.xlsx") # Save the dataset as a .xlsx file
# Reading the external dataset
# You can also load datasets from external files.
# R can read data from various file formats such as .csv, .txt, .xlsx, .sav, etc.
# You can use the read.csv(), read.table(), read_xlsx() functions to read data from files.
# You can use import dataset button in the Environment pane to import datasets. You can also use the File menu to import datasets.
# Read the dataset from the .csv file
mydata_csv = read.csv("mydata.csv")
mydata_csv
## X serial age family income land
## 1 1 1 43.31857 2 2759.327 urban
## 2 2 2 44.30947 5 4017.901 rural
## 3 3 3 49.67612 1 3516.885 rural
## 4 4 4 45.21153 1 3840.730 urban
## 5 5 5 45.38786 2 1002.499 rural
## 6 6 6 50.14519 3 2901.266 rural
## 7 7 7 46.38275 4 1880.476 urban
## 8 8 8 41.20482 5 2519.266 urban
## 9 9 9 42.93944 5 3451.084 rural
## 10 10 10 43.66301 3 2407.192 urban
## 11 11 11 48.67225 1 1444.542 urban
## 12 12 12 46.07944 2 1974.478 urban
## 13 13 13 46.20231 5 3672.222 rural
## 14 14 14 45.33205 5 2670.587 rural
## 15 15 15 43.33248 4 4152.783 urban
## 16 16 16 50.36074 5 1411.459 rural
## 17 17 17 46.49355 2 2739.571 rural
## 18 18 18 39.10015 1 4939.828 urban
## 19 19 19 47.10407 1 4572.204 urban
## 20 20 20 43.58163 3 4545.876 urban
# Read this dataset excluding the first column
mydata_csv = read.csv("mydata.csv")[, -1]
mydata_csv
## serial age family income land
## 1 1 43.31857 2 2759.327 urban
## 2 2 44.30947 5 4017.901 rural
## 3 3 49.67612 1 3516.885 rural
## 4 4 45.21153 1 3840.730 urban
## 5 5 45.38786 2 1002.499 rural
## 6 6 50.14519 3 2901.266 rural
## 7 7 46.38275 4 1880.476 urban
## 8 8 41.20482 5 2519.266 urban
## 9 9 42.93944 5 3451.084 rural
## 10 10 43.66301 3 2407.192 urban
## 11 11 48.67225 1 1444.542 urban
## 12 12 46.07944 2 1974.478 urban
## 13 13 46.20231 5 3672.222 rural
## 14 14 45.33205 5 2670.587 rural
## 15 15 43.33248 4 4152.783 urban
## 16 16 50.36074 5 1411.459 rural
## 17 17 46.49355 2 2739.571 rural
## 18 18 39.10015 1 4939.828 urban
## 19 19 47.10407 1 4572.204 urban
## 20 20 43.58163 3 4545.876 urban
# Read the dataset from the .txt file
mydata_txt = read.table("mydata.txt", header = TRUE, sep = "\t")
mydata_txt
## serial.age.family.income.land
## 1 1 1 43.3185730603434 2 2759.32675041258 urban
## 2 2 2 44.3094675315502 5 4017.90063455701 rural
## 3 3 3 49.6761249424474 1 3516.88452623785 rural
## 4 4 4 45.2115251742737 1 3840.72960540652 urban
## 5 5 5 45.3878632054828 2 1002.49909330159 rural
## 6 6 6 50.1451949606498 3 2901.26629639417 rural
## 7 7 7 46.3827486179676 4 1880.47554064542 urban
## 8 8 8 41.2048162961804 5 2519.26615089178 urban
## 9 9 9 42.9394414443194 5 3451.08401309699 rural
## 10 10 10 43.6630140897001 3 2407.19163697213 urban
## 11 11 11 48.6722453923184 1 1444.54169739038 urban
## 12 12 12 46.0794414811721 2 1974.47789087892 urban
## 13 13 13 46.2023143517822 5 3672.22234979272 rural
## 14 14 14 45.3320481478354 5 2670.58711871505 rural
## 15 15 15 43.3324765957378 4 4152.78333611786 urban
## 16 16 16 50.3607394104092 5 1411.45857702941 rural
## 17 17 17 46.4935514346877 2 2739.57096599042 rural
## 18 18 18 39.1001485301111 1 4939.82791993767 urban
## 19 19 19 47.1040677046911 1 4572.20445759594 urban
## 20 20 20 43.5816257768162 3 4545.87624315172 urban
# Read the dataset from the .xlsx file
mydata_xlsx = read.xlsx("mydata.xlsx")
# You can specify the sheet name and data range in the read_xlsx() function.
# read_xlsx() function is available in the readxl package.
# You can install and load one or several packages at once using the pacman::p_load() function.
# Only once you need to install the pacman package using install.packages("pacman") command.
pacman::p_load(readxl) # Install (if needed) load the readxl package in one command
# Now read the excel file with data range and sheet name
mydata_xlsx = read_xlsx("mydata.xlsx", sheet = "Sheet 1", range = "A1:E20")
# ``, '', "", [ row , column ], $, ~ are used in R.
# ``, '', "" are used for quoting.
# [ row , column ] is used for indexing.
mydata_xlsx[1:2, 2] # First and second row, second column
## # A tibble: 2 × 1
## age
## <dbl>
## 1 43.3
## 2 44.3
# $ is used to access the columns of a dataset.
mydata_xlsx$age # Access the age column
## [1] 43.31857 44.30947 49.67612 45.21153 45.38786 50.14519 46.38275 41.20482
## [9] 42.93944 43.66301 48.67225 46.07944 46.20231 45.33205 43.33248 50.36074
## [17] 46.49355 39.10015 47.10407
mydata_xlsx$family[3:5] # Access the family column from the 3rd to 5th row
## [1] 1 1 2
mydata_xlsx[-c(2, 6), -2] # Access dataset except 2nd and 6th rows, and 2nd column
## # A tibble: 17 × 4
## serial family income land
## <dbl> <dbl> <dbl> <chr>
## 1 1 2 2759. urban
## 2 3 1 3517. rural
## 3 4 1 3841. urban
## 4 5 2 1002. rural
## 5 7 4 1880. urban
## 6 8 5 2519. urban
## 7 9 5 3451. rural
## 8 10 3 2407. urban
## 9 11 1 1445. urban
## 10 12 2 1974. urban
## 11 13 5 3672. rural
## 12 14 5 2671. rural
## 13 15 4 4153. urban
## 14 16 5 1411. rural
## 15 17 2 2740. rural
## 16 18 1 4940. urban
## 17 19 1 4572. urban
# ~ is used in formulas.
# For example, y ~ x means y is modeled as a function of x.
boxplot(age ~ family, data = mydata_xlsx) # Boxplot of age by family

pacman::p_load(car) # qqPlot() function is in the car package
qqPlot(age ~ family, data = mydata_xlsx)

# Most common error source
# Spellings: R is case-sensitive, so make sure to use the correct spellings.
# Brackets: Make sure to open and close the brackets properly.
# Quotes: Use the correct quotes (' or ").
# Directory: Make sure to set the correct working directory.
# Packages: Make sure to install and load the required packages.
# Functions: Make sure to use the correct functions and arguments.
# Run the previous related commands that define the objects in the current line.
# Data columns stacking and unstacking
# Stacking: Combining multiple columns into a single column.
# Unstacking: Splitting a single column into multiple columns.
# We will use tidyverse package for data manipulation.
pacman::p_load(tidyverse)
# Use select to select variables and filter to filter rows.
# We will use pipe operator %>% (Ctrl + Shift + M) to combine multiple operations.
# Left side of the %>% will be the input to the right side function.
mydata_xlsx %>%
select(age, family, land) %>%
filter(age >40, family %in% c(1, 2)) # Select serial, age, family columns and filter age > 45 and family 1 or 2
## # A tibble: 8 × 3
## age family land
## <dbl> <dbl> <chr>
## 1 43.3 2 urban
## 2 49.7 1 rural
## 3 45.2 1 urban
## 4 45.4 2 rural
## 5 48.7 1 urban
## 6 46.1 2 urban
## 7 46.5 2 rural
## 8 47.1 1 urban
# Stack the age and family columns into one column named merged
long = mydata_xlsx %>%
pivot_longer(cols = c(age, family) , names_to = 'merged', values_to = 'values') # Combine age and family columns into merged column
# Unstack the merged column into age and family columns
short = long %>%
pivot_wider(names_from = merged, values_from = values) # Split merged column into age and family columns
# Create new variables (income_per_member) using mutate() function
short = short %>%
mutate(income_per_member = income/family) # Create new variable income_per_member
# Create new variable (status: poor < 1000 income_per_member) using conditional statement
short = short %>%
mutate(status = ifelse(income_per_member < 1000, "poor", "rich")) # Create new variable status
# Create new variable (status1: poor < 1000 income_per_member) using case_when() function
short = short %>%
mutate(status1 = case_when(
income_per_member < 1000 ~ "poor",
income_per_member >= 1000 ~ "rich"
)) # Create new variable status1
# Create new variable (status2: poor < 1000, middle 1000-2000, rich > 2000 income_per_member) using ifelse() function and dollar sign
short$status2 = ifelse(short$income_per_member < 1000, "poor",
ifelse(short$income_per_member <= 2000, "middle", "rich")) # Create new variable status2
# Cut the variable (income_per_member) into intervals (poor < 1000, middle 1000-2000, rich > 2000) using cut() function and save as status3
# right = FALSE means the intervals are left-closed
short$status3 = cut(short$income_per_member, right = FALSE,
breaks = c(-Inf, 1000, 2000-0.001, Inf),
labels = c("poor", "middle", "rich")) # Create new variable status3
# Create a frequency table of status3
table(short$status3) # Frequency table of status3
##
## poor middle rich
## 11 4 4
# Create a cross-tabulation of status3 and land
table(short$status3, short$land) # Cross-tabulation of status3 and land
##
## rural urban
## poor 7 4
## middle 1 3
## rich 1 3
# Use mydata_xlsx dataset for descriptive statistics
# Mean, SD, Median, Min, Max, Range, IQR, Summary functions
# Select a variable using $ sign and apply function
mean(mydata_xlsx$income) # Mean of income_per_member
## [1] 2940.753
sd(mydata_xlsx$income) # Standard deviation of income_per_member
## [1] 1111.229
median(mydata_xlsx$income) # Median of income_per_member
## [1] 2759.327
min(mydata_xlsx$income) # Minimum of income_per_member
## [1] 1002.499
max(mydata_xlsx$income) # Maximum of income_per_member
## [1] 4939.828
range(mydata_xlsx$income) # Range of income_per_member
## [1] 1002.499 4939.828
IQR(mydata_xlsx$income) # Interquartile range of income_per_member
## [1] 1565.641
summary(mydata_xlsx$income) # Summary of income_per_member
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1002 2191 2759 2941 3756 4940
boxplot(mydata_xlsx$income) # Boxplot of income_per_member

# IQR=Q3−Q1, Q2 = Median
# Boxplot: Q1, Q2, Q3, Whiskers, Outliers
# The lower whisker extends from Q1 to the minimum value within 1.5 × IQR below Q1.
# The upper whisker extends from Q3 to the maximum value within 1.5 × IQR above Q3.
# Outliers are the values beyond the whiskers.
# se and cv: Standard error and coefficient of variation
# se = sd/sqrt(n), cv = sd/mean
se = sd(mydata_xlsx$income)/sqrt(length(mydata_xlsx$income)) # Standard error of income_per_member
cv = sd(mydata_xlsx$income)/mean(mydata_xlsx$income) # Coefficient of variation of income_per_member
# Custom function: Create a function to calculate the standard error
# Function name is followed by parentheses and curly brackets.
se = function(x) {
return(sd(x)/sqrt(length(x)))
}
se(mydata_xlsx$income) # Standard error of income_per_member
## [1] 254.9334
# apply function
# apply() function is used to apply a function to the rows or columns of a matrix or data frame.
head(mydata_xlsx)
## # A tibble: 6 × 5
## serial age family income land
## <dbl> <dbl> <dbl> <dbl> <chr>
## 1 1 43.3 2 2759. urban
## 2 2 44.3 5 4018. rural
## 3 3 49.7 1 3517. rural
## 4 4 45.2 1 3841. urban
## 5 5 45.4 2 1002. rural
## 6 6 50.1 3 2901. rural
# Calculate the mean of each column of age, family and income
apply(mydata_xlsx[, c("age", "family", "income")], 2, mean) # Apply the mean function to each column
## age family income
## 45.52188 3.00000 2940.75256
# Divide each row's income by family members and save it as income_per_member
mydata_xlsx$income_per_member = apply(mydata_xlsx[, c("income", "family")], 1, function(x) x[1]/x[2]) # Apply the function to each row
# lapply and sapply functions
# lapply() function is used to apply a function to each element of a list or vector.
# sapply() function is used to simplify the output of lapply() function.
lapply(mydata_xlsx[, c("age", "family", "income")], mean) # Apply the mean function to each column
## $age
## [1] 45.52188
##
## $family
## [1] 3
##
## $income
## [1] 2940.753
sapply(mydata_xlsx[, c("age", "family", "income")], mean) # Simplify the output
## age family income
## 45.52188 3.00000 2940.75256
# tapply function
# tapply() function is used to apply a function to subsets of a vector or data frame.
# The first argument is the vector or data frame, the second argument is the factor variable, and the third argument is the function.
tapply(mydata_xlsx$income, mydata_xlsx$land, mean) # Apply the mean function to income by land
## rural urban
## 2820.386 3049.082
# aggregate function to calculate the mean of income by land
aggregate(income ~ land, data = mydata_xlsx, mean) # Aggregate the mean of income by land
## land income
## 1 rural 2820.386
## 2 urban 3049.082
# use summarise function to calculate the mean of income by land
mydata_xlsx %>%
group_by(land) %>%
summarise(mean_income = mean(income)) # Summarise the mean of income by land
## # A tibble: 2 × 2
## land mean_income
## <chr> <dbl>
## 1 rural 2820.
## 2 urban 3049.
# Data structure
str(mydata_xlsx)
## tibble [19 × 6] (S3: tbl_df/tbl/data.frame)
## $ serial : num [1:19] 1 2 3 4 5 6 7 8 9 10 ...
## $ age : num [1:19] 43.3 44.3 49.7 45.2 45.4 ...
## $ family : num [1:19] 2 5 1 1 2 3 4 5 5 3 ...
## $ income : num [1:19] 2759 4018 3517 3841 1002 ...
## $ land : chr [1:19] "urban" "rural" "rural" "urban" ...
## $ income_per_member: num [1:19] 1380 804 3517 3841 501 ...
# Change the datatype land from character to factor
mydata_xlsx$land = as.factor(mydata_xlsx$land)
# Change the datatype family and land to factor in one command
factor_vars = c("family", "land")
mydata_xlsx[factor_vars] = lapply(mydata_xlsx[factor_vars], as.factor)
# Demonstrate CLT with this dataset
# Central Limit Theorem (CLT) states that the sampling distribution of the sample mean approaches a normal distribution as the sample size increases (n>=30).
# The mean of the sampling distribution is equal to the population mean.
# 68% of the sample means fall within one standard error of the population mean.
# 95% of the sample means fall within 1.96 standard errors of the population mean.
# 99% of the sample means fall within 2.58 standard errors of the population mean.
# Create a sampling distribution of the sample mean
# Set the seed for reproducibility
set.seed(123)
n = 30 # Sample size
B = 1000 # Number of samples
sample_means = replicate(B, mean(sample(mydata_xlsx$income, n, replace = TRUE))) # Generate B sample means
# Calculate the mean and standard deviation of the sample means
(mean_sample_means = mean(sample_means)) # Mean of the sample means
## [1] 2948.668
(sd_sample_means = sd(sample_means)) # Standard deviation of the sample means
## [1] 196.1562
(se_sample_means = sd_sample_means) # Standard error of the sample means
## [1] 196.1562
# Plot the sampling distribution of the sample mean
hist(sample_means, breaks = 30, col = "skyblue", main = "Sampling Distribution of the Sample Mean", xlab = "Sample Mean")

{ # use second bracket to combine multiline codes
plot(density(sample_means), col = "blue", main = "Sampling Distribution of the Sample Mean", xlab = "x-axis => Sample Mean or z-values \n Probability = Area under the curve.", ylab = "Density or relative frequency", lty = 1, lwd = 2)
abline(v = mean_sample_means, col = "red", lty = 2, lwd = 2) # Add a vertical line for the mean of the sample means
abline(v = mean_sample_means + c(-1, 1)*se_sample_means*1.96, col = "black", lty = 1, lwd = 2) # Add vertical lines for the mean ± 1.96 SE
text(3000, 0.0005, paste('mean =', round(mean_sample_means)), pos = 4, col = "red", srt = 90) # Add a text for the mean of the sample means
text(mean_sample_means, 0, '0', pos = 3, col = "black")
text(3400, 0.0005, 'mean+1.96se', pos = 4, col = "black", srt=90)
text(3200, 0.0000, '+1.96', pos = 4, col = "black", srt=0)
text(2400, 0.0005, 'mean-1.96se', pos = 4, col = "black", srt=90)
text(2400, 0.0000, '-1.96', pos = 4, col = "black", srt=0)
text(2900, 0.0020, '95% CI [2557, 3341]', pos = 1, col = "black", srt=0)
}

# Now sort the mydata_xlsx$income dataset by income in ascending order
(income_ascending = sort(mydata_xlsx$income, decreasing = FALSE))
## [1] 1002.499 1411.459 1444.542 1880.476 1974.478 2407.192 2519.266 2670.587
## [9] 2739.571 2759.327 2901.266 3451.084 3516.885 3672.222 3840.730 4017.901
## [17] 4152.783 4572.204 4939.828
# Find the 2.5th and 97.5th percentiles of the income dataset
(quantile(income_ascending, c(0.025, 0.975)))
## 2.5% 97.5%
## 1186.531 4774.397
# 95% CI [1186.531, 4774.397]
# Calculate CI from the sample mean
mean_income = mean(mydata_xlsx$income)
sd_income = sd(mydata_xlsx$income)
se_income = se(mydata_xlsx$income)
c((mean_income - 1.96*se_income),(mean_income + 1.96*se_income))
## [1] 2441.083 3440.422
# 95% CI [2441.083, 3440.422]
# From the plot, 95% CI [2557, 3341] and 95% CI [2441.083, 3440.422] are close to each other. These are based on distribution based.
# If the sample is not normally distributed, the CI based on the distribution may not be accurate.
# Therefore, we need to check the normality of a variable or residuals when it is necessary, otherwise the test statistics will not be reliable.
# Conditional statement
# if, else if, else
# if (condition) {statement} else {statement}
# if (condition) {statement} else if (condition) {statement} else {statement}
if (mean(mydata_xlsx$income) > 3000) {
print("The mean income is greater than 3000.")
} else {
print("The mean income is less than or equal to 3000.")
}
## [1] "The mean income is less than or equal to 3000."
if (mean(mydata_xlsx$income) > 4000) {
print("The mean income is greater than 4000.")
} else if (mean(mydata_xlsx$income) > 3000) {
print("The mean income is greater than 3000 but less than or equal to 4000.")
} else {
print("The mean income is less than or equal to 3000.")
}
## [1] "The mean income is less than or equal to 3000."
# Loop function
# for loop, while loop
# for (variable in sequence) {statement}
# while (condition) {statement}
for (i in 1:5) {
print(i)
}
## [1] 1
## [1] 2
## [1] 3
## [1] 4
## [1] 5
while (i <= 10) {
print(i)
i = i + 1
}
## [1] 5
## [1] 6
## [1] 7
## [1] 8
## [1] 9
## [1] 10
# Iterate the mean function over three columns of the dataset and same the means in means object
means = c() # blank vector for storing calculated means
for (column in c("age", "family", "income")) {
means = c(means, mean(mydata_xlsx[[column]]))
}
## Warning in mean.default(mydata_xlsx[[column]]): argument is not numeric or
## logical: returning NA
means
## [1] 45.52188 NA 2940.75256
# Look the format of the column indexing.
# mydata_xlsx[[column]] is used for indexing the columns of the dataset.
# Here, column = 1, in the first iteration to calculate the mean of age
# column = 2, in the second iteration to calculate the mean of family, and so on.
# Just for your checking
mydata_xlsx[[1]] # First column of the dataset
## [1] 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19
mydata_xlsx['age'] # This contains variable name and values
## # A tibble: 19 × 1
## age
## <dbl>
## 1 43.3
## 2 44.3
## 3 49.7
## 4 45.2
## 5 45.4
## 6 50.1
## 7 46.4
## 8 41.2
## 9 42.9
## 10 43.7
## 11 48.7
## 12 46.1
## 13 46.2
## 14 45.3
## 15 43.3
## 16 50.4
## 17 46.5
## 18 39.1
## 19 47.1
mydata_xlsx[['age']] # This contains only values
## [1] 43.31857 44.30947 49.67612 45.21153 45.38786 50.14519 46.38275 41.20482
## [9] 42.93944 43.66301 48.67225 46.07944 46.20231 45.33205 43.33248 50.36074
## [17] 46.49355 39.10015 47.10407
# Data visualization: ggplot2 package
# ggplot2 is a powerful package for creating graphics in R.
# It is based on the grammar of graphics, which is a structured way (layer by layer) of creating graphics.
# Scattered plot of x=age and y=income
pacman::p_load(jtools)
ggplot(mydata_xlsx, aes(x = age, y = income)) +
geom_point() + # Scatter plot of age and income
labs(title = "Scatter plot of Age and Income", x = "Age", y = "Income") + # Add title and axis labels
theme_apa()

# Boxplot of income by land
ggplot(mydata_xlsx, aes(x = land, y = income)) +
geom_boxplot() + # Boxplot of income by land
labs(title = "Boxplot of Income by Land", x = "Land", y = "Income") + # Add title and axis labels
theme_apa()

# Histogram of income
ggplot(mydata_xlsx, aes(x = income)) +
geom_histogram(binwidth = 500, fill = "skyblue", color = "black") + # Histogram of income
labs(title = "Histogram of Income", x = "Income", y = "Frequency") + # Add title and axis labels
theme_apa()

# Density plot of income by land
ggplot(mydata_xlsx, aes(x = income, fill = land)) +
geom_density(alpha = 0.5) + # Density plot of income by land
labs(title = "Density Plot of Income by Land", x = "Income", y = "Density") + # Add title and axis labels
theme_apa()

# Bar plot of family by land
ggplot(mydata_xlsx, aes(x = family, fill = land)) +
geom_bar(position = "dodge") + # Bar plot of family by land
labs(title = "Bar Plot of Family by Land", x = "Family", y = "Count") + # Add title and axis labels
theme_apa()

# Scatter plot of income by age
ggplot(mydata_xlsx, aes(x = age, y = income, group = 1)) +
geom_point(color = "tomato") +
labs(title = "Line Plot of Income by Age", x = "Age", y = "Income") + # Add title and axis labels
theme_apa() +
geom_smooth(method = "lm", se = TRUE, formula = y ~ x) # Add a linear regression line

# Pie chart of family labelled by percentage
# Calculate counts and proportions for each family
pie_data = mydata_xlsx %>%
group_by(family) %>%
summarise(count = n()) %>%
mutate(percentage = 100 * count / sum(count))
# Create the pie chart
ggplot(pie_data, aes(x = "", y = count, fill = family)) +
geom_bar(stat = "identity", width = 1) + # Bar plot with counts
coord_polar("y", start = 0) + # Convert to pie chart
geom_text(aes(label = paste0(round(percentage,1), '%') ),
position = position_stack(vjust = 0.5)) + # Add percentage labels
labs(title = "Pie Chart of Family", fill = "Family") + # Add title and legend
theme_void() + # Remove background and axes
theme(legend.position = "bottom",
plot.title = element_text(hjust = 0.5)) # Remove the legend

# Spider plot or radar chart
# devtools::install_github("ricardo-bion/ggradar")
pacman::p_load(ggradar, scales, patchwork)
# Create a dataset for the spider plot
set.seed(1)
spider_data = - as.data.frame(
matrix(sample(1:10 , 96 , replace = TRUE),
ncol=8, byrow = TRUE))
# Change the column names
colnames(spider_data) = c( "Dhaka", "Khulna", "Barishal", "Chittagong", "Rangpur",
"Rajshahi", "Sylhet", "Cumilla")
# Set the row names
rownames(spider_data) <- month.abb[1:12]
spider_data
## Dhaka Khulna Barishal Chittagong Rangpur Rajshahi Sylhet Cumilla
## Jan -9 -4 -7 -1 -2 -7 -2 -3
## Feb -1 -5 -5 -10 -6 -10 -7 -9
## Mar -5 -5 -9 -9 -5 -5 -2 -10
## Apr -9 -1 -4 -3 -6 -10 -10 -6
## May -4 -4 -10 -9 -7 -6 -9 -8
## Jun -9 -7 -8 -6 -10 -7 -3 -10
## Jul -6 -8 -2 -2 -6 -6 -1 -3
## Aug -3 -8 -6 -7 -6 -8 -7 -1
## Sep -4 -8 -9 -9 -7 -4 -7 -6
## Oct -1 -5 -6 -1 -9 -7 -7 -3
## Nov -6 -2 -10 -10 -7 -3 -2 -10
## Dec -1 -10 -10 -8 -10 -5 -7 -8
# Scale the data between 0 and 1 and keep two decimal places
scaled_data <- round(apply(spider_data, 2, scales::rescale), 2)
plot_data <- as.data.frame(scaled_data)
plot_data
## Dhaka Khulna Barishal Chittagong Rangpur Rajshahi Sylhet Cumilla
## Jan 0.00 0.67 0.38 1.00 1.00 0.43 0.89 0.78
## Feb 1.00 0.56 0.62 0.00 0.50 0.00 0.33 0.11
## Mar 0.50 0.56 0.12 0.11 0.62 0.71 0.89 0.00
## Apr 0.00 1.00 0.75 0.78 0.50 0.00 0.00 0.44
## May 0.62 0.67 0.00 0.11 0.38 0.57 0.11 0.22
## Jun 0.00 0.33 0.25 0.44 0.00 0.43 0.78 0.00
## Jul 0.38 0.22 1.00 0.89 0.50 0.57 1.00 0.78
## Aug 0.75 0.22 0.50 0.33 0.50 0.29 0.33 1.00
## Sep 0.62 0.22 0.12 0.11 0.38 0.86 0.33 0.44
## Oct 1.00 0.56 0.50 1.00 0.12 0.43 0.33 0.78
## Nov 0.38 0.89 0.00 0.00 0.38 1.00 0.89 0.00
## Dec 1.00 0.00 0.00 0.22 0.00 0.71 0.33 0.22
# Create the radar plot for the first row of the dataset
ggradar(plot_data[1, ],
values.radar = c('0%', '50%', '100%'),
grid.min = 0, grid.mid = 0.5, grid.max = 1,
legend.position = "top")

# Create the radar plot for two rows of the dataset
ggradar(plot_data[2:3,],
values.radar = c('0%', '50%', '100%'),
grid.min = 0, grid.mid = 0.5, grid.max = 1,
legend.position = "top")

# Create the radar plot for all rows of the dataset using a loop
# Create an empty list to store the plots
{radar_plots = list()
for (i in 1:nrow(plot_data)) {
radar_plot = ggradar(plot_data[i,],
values.radar = c('0%', '50%', '100%'),
grid.min = 0, grid.mid = 0.5, grid.max = 1,
plot.title = row.names(plot_data)[i],
legend.position = "top",
group.point.size = 2,
group.line.width = 1,
axis.label.size = 2,
grid.label.size = 2,
base.size = 12) +
theme(plot.title = element_text(size = 9, hjust = 0.5))
radar_plots[[i]] = radar_plot # Store the radar plot in the list
}
wrap_plots(radar_plots, ncol = 3) # Combine the radar plots into a single plot
}

# Save the plot as a .png file
ggsave("spider_plot.png", dpi = 300, width = 6, height = 8, units = 'in')
# Additional bivariate and [multivariate plots can be found] (https://rpubs.com/Professor Dr. Md. Kamrul Hasanlext/correlation)