R Markdown - Getting Started

This document was produced by Celestin Niyomugabo ().
Visit www.CelestinNiyomugabo.com for booking arrangements.

Note: Some of the data used in this document are dummy or synthetic and cannot be cited as a reference. They are used for educational purposes only.

Working with vectors, lists, and matrices

A vector can be created using c() and assigned to a name with either the equals assignment = or the leftward assignment <-. The script below shows how to create:

  1. vector: a vector of integers/numbers
  2. boolean: a vector of logical elements (TRUE or FALSE)
  3. employees_names: a vector of strings (here, employee names)
  4. employees_age: a vector of employee ages (numeric values)
  5. employees_salary: a vector of employee salaries (numeric values)
  6. employee_sex: a vector of employee sex
vector <- c(12,34,56,78,97)

boolean <- c(TRUE, FALSE)
employees_names = c("Joe", "Doe", "Claire", "Peter", "Tom")
employees_age = c(23,43,25,31,48)
employees_salary = c(12000,23500,42100,34500,69200)
employee_sex = c("Male", "Male", "Female", "Male", "Female")
new_age = employees_age+3

# A vector of a sequence of numbers, from 1 to 5
serial_number = c(1:5)

# A vector of complex numbers
complex_vector = c(1+0i, 1+2i)
print(complex_vector)
[1] 1+0i 1+2i
# The vectors (columns) can be combined using cbind(); note that cbind() coerces all values to character here
employees = cbind(employees_names, employees_age, employees_salary, employee_sex)
print(employees)
     employees_names employees_age employees_salary employee_sex
[1,] "Joe"           "23"          "12000"          "Male"      
[2,] "Doe"           "43"          "23500"          "Male"      
[3,] "Claire"        "25"          "42100"          "Female"    
[4,] "Peter"         "31"          "34500"          "Male"      
[5,] "Tom"           "48"          "69200"          "Female"    
# Matrix
new_matrix_1 = matrix(1:20, nrow = 4)  # a 4 x 5 matrix, filled column-wise
new_matrix_2 = matrix(4:11, ncol = 4)  # a 2 x 4 matrix

# Matrix multiplication: (2 x 4) %*% (4 x 5) gives a 2 x 5 matrix
matrix_product = new_matrix_2 %*% new_matrix_1
print(matrix_product)
     [,1] [,2] [,3] [,4] [,5]
[1,]   80  192  304  416  528
[2,]   90  218  346  474  602
# Creating a list
new_list = list(employees_names, serial_number, matrix_product, boolean)
new_list
[[1]]
[1] "Joe"    "Doe"    "Claire" "Peter"  "Tom"   

[[2]]
[1] 1 2 3 4 5

[[3]]
     [,1] [,2] [,3] [,4] [,5]
[1,]   80  192  304  416  528
[2,]   90  218  346  474  602

[[4]]
[1]  TRUE FALSE
# A vector with missing values
with_missing = c(12,34,55,23, NA, 13, NA, 23)
na_values = is.na(with_missing)
filter_data = with_missing[!na_values]
filter_data
[1] 12 34 55 23 13 23
double_filter = complete.cases(employees_names, employees_salary) # TRUE where both vectors have no missing values

Data frame

To create a data frame, you can use data.frame() or cbind().

# DataFrame
employees_data = as.data.frame(employees)
employees_data = data.frame(employees_names, employees_age, employees_salary, employee_sex)
employees_data
  employees_names employees_age employees_salary employee_sex
1             Joe            23            12000         Male
2             Doe            43            23500         Male
3          Claire            25            42100       Female
4           Peter            31            34500         Male
5             Tom            48            69200       Female

Summary statistics

You can use the summary() command to get the summary statistics of a given variable. This includes the minimum, maximum, mean, and median, as well as the quartiles.

mean(employees_data$employees_salary) # Mean for the column employees_salary
[1] 36260
summary(employees_data$employees_age) # Summary statistics for employees_age
   Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
     23      25      31      34      43      48 
summary(employees_data) # Summary statistics for the entire data frame
 employees_names    employees_age employees_salary employee_sex      
 Length:5           Min.   :23    Min.   :12000    Length:5          
 Class :character   1st Qu.:25    1st Qu.:23500    Class :character  
 Mode  :character   Median :31    Median :34500    Mode  :character  
                    Mean   :34    Mean   :36260                      
                    3rd Qu.:43    3rd Qu.:42100                      
                    Max.   :48    Max.   :69200                      

A data frame can also be created using the data editor in R (via edit()).

dataset_1 = data.frame(height=numeric(0), weight=numeric(0))
# dataset_1 = edit(dataset_1)

Reading data from external source

R can read data from external files such as CSV files, text files, Stata files, etc. The following script shows data imported from a CSV file and a text file. The names used when creating the text file were sourced from https://1000randomnames.com/, for practice purposes only. Before working with files, it is better to first set the working directory.

I also imported a Stata (.dta) file after installing the haven package.

# Set working directory
directory_of_current_file = dirname(file.path(getwd(), "Practice.Rmd")) # The file is currently saved at "/Users/vonsung/ownCloud - Celestin Niyomugabo@cloud.vonsung.io/AUCA - Big Data/R for Data Science/" on my computer
setwd(directory_of_current_file) # Setting the current working directory to where the file Practice.Rmd is saved


csv_data = read.csv(file="datasets/sample data.csv", header= T)
text_dataset = read.table(file="datasets/sample data.txt", header = T, sep = "\t")

#install.packages("haven")
library("haven")
stata_dataset = read_dta(file="datasets/stata dataset.dta")
  1. colnames() can be used to view the names of the columns (variables) of the imported data.
  2. Alternatively, you can use head() to preview the first rows of the data frame.
  3. Use View() to preview the entire data frame.
  4. Use length() to get the number of variables (columns) in the data frame.
colnames(text_dataset)
[1] "Name"   "Age"    "Height" "Weight" "Class" 
head(stata_dataset)
# A tibble: 6 × 100
  consent   interview_date province    district sector     type    sex     dob  
  <dbl+lbl> <chr>          <dbl+lbl>   <dbl+lb> <dbl+lbl>  <dbl+l> <dbl+l> <chr>
1 1 [Yes]   2023-05-13     4 [Norther… 45 [Rul… 4517 [Tum… 1 [Ind… 1 [Mal… 1959…
2 1 [Yes]   2023-05-14     4 [Norther… 45 [Rul… 4504 [Buy… 1 [Ind… 1 [Mal… 1959…
3 1 [Yes]   2023-05-14     4 [Norther… 45 [Rul… 4504 [Buy… 1 [Ind… 1 [Mal… 1983…
4 1 [Yes]   2023-05-14     4 [Norther… 45 [Rul… 4517 [Tum… 1 [Ind… 1 [Mal… 1964…
5 1 [Yes]   2023-05-14     4 [Norther… 45 [Rul… 4517 [Tum… 1 [Ind… 2 [Fem… 1964…
6 1 [Yes]   2023-05-14     4 [Norther… 45 [Rul… 4517 [Tum… 1 [Ind… 2 [Fem… 1983…
# ℹ 92 more variables: age <dbl>, education <dbl+lbl>, hh_size <dbl+lbl>,
#   income1 <dbl+lbl>, income2 <dbl+lbl>, tea_area <dbl+lbl>,
#   tea_income <dbl+lbl>, formal_credit <dbl+lbl>,
#   formal_credit_count <dbl+lbl>, lender_institution_type <dbl+lbl>,
#   other_lender_institution_type <chr>, formal_credit_size <dbl+lbl>,
#   formal_payback_period <dbl+lbl>, formal_interest_rate <dbl>,
#   formal_defaulted <dbl+lbl>, formal_defaulted_type <dbl+lbl>, …
#View(stata_dataset)
length(text_dataset)
[1] 5

Plots

1. Histogram plot (used for numeric data)

  1. Use hist() to plot a histogram. main='' defines the plot header/title,
  2. xlab='' and ylab='' define the horizontal and vertical axis labels respectively,
  3. col='' defines the color of the histogram bars. The color can be given either as a color name (e.g. red, blue, green, orange, ...) or as an RGB hex code (e.g. #FFFFFF for white, #00FF00 for green, etc.),
  4. The border attribute is used to customize the border color,
  5. yaxt = 'n' and xaxt = 'n' hide the y and x axes respectively. To hide all axes, use axes = F,
  6. I added the text() function to display the frequency for each age category.
histogram = hist(text_dataset$Age, main="Histogram plot for age", xlab = "Age", ylab = "", col = "#aff218", border="#000000", freq = T, axes = T, yaxt = "n")

text(histogram$mids, histogram$counts, labels = histogram$counts, pos = 3, col = "black")

2. Stacked and side-by-side bar diagrams

Both stacked and side-by-side bar diagrams can be plotted using barplot() with the following arguments:

  1. You first need to provide the data in matrix form,
  2. To get a stacked diagram, set the beside argument to FALSE,
  3. To get a side-by-side bar plot, set beside to TRUE (or T),
  4. col needs to be specified as a vector, since a color is given for each category,
  5. names.arg specifies the grouping category/variable.

In the example below, a data frame of different tax types (Corporate Income Tax and Personal Income Tax) in different countries was first created; then a bar plot was drawn.

# Below we create arbitrary data for tax collections by country
cit = c(12.3, 14.2, 10.9, 15.8)
pit = c(2.5, 4.1, 2.9, 5.2)
countries = c("Rwanda", "Tanzania", "Burundi", "Kenya")
tax_data = data.frame(Country = countries, CIT = cit, PIT = pit) # A dataframe for taxes collected
tax_data
   Country  CIT PIT
1   Rwanda 12.3 2.5
2 Tanzania 14.2 4.1
3  Burundi 10.9 2.9
4    Kenya 15.8 5.2
barplot(rbind(tax_data$CIT, tax_data$PIT), beside = T, col = c("green", "gold"), names.arg = tax_data$Country, main = "CIT vs PIT collection in 2023 (in Bn USD)")
legend("topright", legend = c("CIT", "PIT"), fill = c("green", "gold"), cex = .6)

The following script plots a stacked diagram since the beside argument was set to FALSE:

barplot(rbind(tax_data$CIT, tax_data$PIT), beside = F, col = c("green", "gold"), names.arg = tax_data$Country, main = "CIT vs PIT collection in 2023 (in Bn USD)")
legend("topleft", legend = c("CIT", "PIT"), fill = c("green", "gold"), cex = .6)

If you have a long dataset and want to plot summarized/grouped statistics, you first need to aggregate your data.

# Bar graph data from a dataframe
bar_data = aggregate(cbind(Height, Weight) ~ Class, data = text_dataset, function(x) round(mean(x), 1))


barplot_var = barplot(rbind(bar_data$Height, bar_data$Weight), beside = TRUE, col = c("#abf254", "#23af10"), names.arg = bar_data$Class, main = "Average height and weight for students", xlab = "", ylab = "Height/Weight average")


# Add legend
legend("topright", legend = c("Height", "Weight"), fill = c("#abf254", "#23af10"), cex = .6)

3. Pie chart

A pie chart can be used to present each category's contribution to the total.

pie(tax_data$CIT, labels = tax_data$Country, main = "CIT Distribution", col = rainbow(length(tax_data$CIT)))
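
To make each contribution explicit, the slice labels can include percentages computed with base R; a minimal sketch reusing tax_data (the label format is just one possible choice):

# Labels combining country names with their share of total CIT
pie_labels = paste0(tax_data$Country, " (", round(100 * tax_data$CIT / sum(tax_data$CIT)), "%)")
pie(tax_data$CIT, labels = pie_labels, main = "CIT Distribution", col = rainbow(length(tax_data$CIT)))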

4. Scatter plot

A scatter plot is used to show the relationship between 2 continuous variables. In the example below, height is plotted against weight for the students in our imported dataset.

plot(text_dataset$Height, text_dataset$Weight, main = "Height VS Weight of students", xlab = "Height (in cm)", ylab = "Weight (in kg)", col = "blue", pch=4) # pch attribute is used to define the dot type

5. Box plot

A box plot is used to visualize the distribution of a variable and identify outliers.

boxplot(text_dataset$Height~text_dataset$Class, main="Box plot for student height", xlab = "", ylab = "Height")
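
The values flagged as outliers can also be recovered from the object that boxplot() returns; a minimal sketch (plot = FALSE just suppresses the drawing):

bp = boxplot(text_dataset$Height ~ text_dataset$Class, plot = FALSE)
bp$out  # heights lying beyond the whiskers, if any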

Using R built-in data

The R installation comes with some datasets intended for practice purposes. Use data() to print a list of the datasets that come with R. To access a particular dataset (airquality, for instance), use data.frame(airquality).

#data()
airquality_data = data.frame(airquality)
head(airquality_data)
  Ozone Solar.R Wind Temp Month Day
1    41     190  7.4   67     5   1
2    36     118  8.0   72     5   2
3    12     149 12.6   74     5   3
4    18     313 11.5   62     5   4
5    NA      NA 14.3   56     5   5
6    28      NA 14.9   66     5   6
summary(airquality)
     Ozone           Solar.R           Wind             Temp      
 Min.   :  1.00   Min.   :  7.0   Min.   : 1.700   Min.   :56.00  
 1st Qu.: 18.00   1st Qu.:115.8   1st Qu.: 7.400   1st Qu.:72.00  
 Median : 31.50   Median :205.0   Median : 9.700   Median :79.00  
 Mean   : 42.13   Mean   :185.9   Mean   : 9.958   Mean   :77.88  
 3rd Qu.: 63.25   3rd Qu.:258.8   3rd Qu.:11.500   3rd Qu.:85.00  
 Max.   :168.00   Max.   :334.0   Max.   :20.700   Max.   :97.00  
 NA's   :37       NA's   :7                                       
     Month            Day      
 Min.   :5.000   Min.   : 1.0  
 1st Qu.:6.000   1st Qu.: 8.0  
 Median :7.000   Median :16.0  
 Mean   :6.993   Mean   :15.8  
 3rd Qu.:8.000   3rd Qu.:23.0  
 Max.   :9.000   Max.   :31.0  
                               
airquality[2:4,4]
[1] 72 74 62
str(airquality) #to get data type
'data.frame':   153 obs. of  6 variables:
 $ Ozone  : int  41 36 12 18 NA 28 23 19 8 NA ...
 $ Solar.R: int  190 118 149 313 NA NA 299 99 19 194 ...
 $ Wind   : num  7.4 8 12.6 11.5 14.3 14.9 8.6 13.8 20.1 8.6 ...
 $ Temp   : int  67 72 74 62 56 66 65 59 61 69 ...
 $ Month  : int  5 5 5 5 5 5 5 5 5 5 ...
 $ Day    : int  1 2 3 4 5 6 7 8 9 10 ...
summary(pressure)
  temperature     pressure       
 Min.   :  0   Min.   :  0.0002  
 1st Qu.: 90   1st Qu.:  0.1800  
 Median :180   Median :  8.8000  
 Mean   :180   Mean   :124.3367  
 3rd Qu.:270   3rd Qu.:126.5000  
 Max.   :360   Max.   :806.0000  

Working with R built-in data

R comes with sample data which can be accessed using data().

variable.names(iris) # List all variable names of iris dataframe
[1] "Sepal.Length" "Sepal.Width"  "Petal.Length" "Petal.Width"  "Species"     
# str(iris) # Get the structure of your data frame
iris[1:10,] #Print first 10 observations
   Sepal.Length Sepal.Width Petal.Length Petal.Width Species
1           5.1         3.5          1.4         0.2  setosa
2           4.9         3.0          1.4         0.2  setosa
3           4.7         3.2          1.3         0.2  setosa
4           4.6         3.1          1.5         0.2  setosa
5           5.0         3.6          1.4         0.2  setosa
6           5.4         3.9          1.7         0.4  setosa
7           4.6         3.4          1.4         0.3  setosa
8           5.0         3.4          1.5         0.2  setosa
9           4.4         2.9          1.4         0.2  setosa
10          4.9         3.1          1.5         0.1  setosa
mean(iris$Sepal.Length[iris$Species=="setosa"]) # Get the sepal length mean within the setosa species
[1] 5.006
mean(iris$Petal.Length[iris$Species=="setosa"]) # Get the petal length mean within the setosa species
[1] 1.462
# Get summary statistics by categories
by(iris$Petal.Length, iris$Species, mean)
iris$Species: setosa
[1] 1.462
------------------------------------------------------------ 
iris$Species: versicolor
[1] 4.26
------------------------------------------------------------ 
iris$Species: virginica
[1] 5.552
by(iris$Petal.Length, iris$Species, sd)
iris$Species: setosa
[1] 0.173664
------------------------------------------------------------ 
iris$Species: versicolor
[1] 0.469911
------------------------------------------------------------ 
iris$Species: virginica
[1] 0.5518947
by(iris$Petal.Length, iris$Species, summary)
iris$Species: setosa
   Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
  1.000   1.400   1.500   1.462   1.575   1.900 
------------------------------------------------------------ 
iris$Species: versicolor
   Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
   3.00    4.00    4.35    4.26    4.60    5.10 
------------------------------------------------------------ 
iris$Species: virginica
   Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
  4.500   5.100   5.550   5.552   5.875   6.900 

Box plot of built-in data

After accessing R built-in data, you can do some manipulations. The following script draws a box plot using the built-in iris data.

boxplot(iris$Sepal.Length~iris$Species, data=iris, ylab="Sepal length", xlab="Species")

Data management

1. Working with dates

You can use Sys.Date() to get the current system date. To convert a value to a date, use as.Date(). difftime() is used to compute the time difference. The example below shows how to calculate the number of elapsed days between two dates.

today = Sys.Date()
dob = as.Date("2000-01-01")
difftime(today, dob, units = "days")
Time difference of 9354 days
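
as.Date() also accepts a format argument for dates that are not in the default yyyy-mm-dd layout, and format() turns a Date back into text; a minimal sketch (the example date string is made up for illustration):

# Parse a date written as day/month/year
custom_date = as.Date("15/08/2021", format = "%d/%m/%Y")
# Display today's date in a different layout
format(Sys.Date(), "%d %B %Y")
# The elapsed time can also be expressed in other units
difftime(Sys.Date(), custom_date, units = "weeks")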

2. Creating a new variable

The following script creates a new variable named salary_to_age, which is the ratio of an employee's salary to their age. The round() function rounds the newly created variable to one decimal place (digit = 1).

attach(employees_data)
The following objects are masked _by_ .GlobalEnv:

    employee_sex, employees_age, employees_names, employees_salary
(salary_to_age = round((employees_salary/employees_age),digit=1))
[1]  521.7  546.5 1684.0 1112.9 1441.7
head(cbind(employees_data, salary_to_age))
  employees_names employees_age employees_salary employee_sex salary_to_age
1             Joe            23            12000         Male         521.7
2             Doe            43            23500         Male         546.5
3          Claire            25            42100       Female        1684.0
4           Peter            31            34500         Male        1112.9
5             Tom            48            69200       Female        1441.7

Let’s create a new variable, “BMI” (a weight-to-height ratio), based on student height and weight:

BMI = round(text_dataset$Weight/(text_dataset$Height/100), digits = 1) # Height is converted from cm to meters
text_dataset = data.frame(text_dataset, BMI)
head(text_dataset)
              Name Age Height Weight   Class  BMI
1   Lillie Galindo  52    148     87 Level 3 58.8
2 Salvatore Warner  17    151     91 Level 4 60.3
3     Wynter Novak  42    154     53 Level 3 34.4
4     Bishop Ramos  68    163     48 Level 3 29.4
5      Alice Cantu  39    142     90 Level 3 63.4
6     Anakin Gomez  26    147     67 Level 4 45.6
tail(text_dataset)
               Name Age Height Weight   Class  BMI
60     Rogelio Wolf  59    150     66 Level 1 44.0
61     Jolene Meyer  33    167     61 Level 1 36.5
62 Tristan Chambers  63    151     89 Level 3 58.9
63   Makayla Spence  54    166     63 Level 3 38.0
64      Cillian Day  55    164     72 Level 2 43.9
65     Hayden Bryan  72    155     62 Level 3 40.0

3. Merging data

  1. In the example below, we are going to create a new dataset of PAYE tax for the countries we already defined: Rwanda, Tanzania, Burundi, and Kenya.
  2. Once we have 2 datasets with a common identifier (Country), we can merge them using merge() (or merge.data.frame()) and specify the common ID.
paye_data = read.table(text = "Country PAYE
                        Rwanda 23.3
                        Tanzania 26.2
                        Burundi 19.7
                        Kenya 24.5", header = T)

tax_data_merged = merge.data.frame(tax_data, paye_data, by="Country")
tax_data_merged
   Country  CIT PIT PAYE
1  Burundi 10.9 2.9 19.7
2    Kenya 15.8 5.2 24.5
3   Rwanda 12.3 2.5 23.3
4 Tanzania 14.2 4.1 26.2

Merging using cbind() function

To apply the cbind() command, we first need to sort by the key (common) identifier.

tax_data_ordered = tax_data[order(tax_data$Country),]
paye_data_ordered = paye_data[order(paye_data$Country),]
PAYE = paye_data_ordered$PAYE
tax_data_merged_2 = cbind(tax_data_ordered, PAYE)
tax_data_merged_2
   Country  CIT PIT PAYE
3  Burundi 10.9 2.9 19.7
4    Kenya 15.8 5.2 24.5
1   Rwanda 12.3 2.5 23.3
2 Tanzania 14.2 4.1 26.2

4. Appending the datasets

To append datasets, use rbind(). Note that the datasets must have exactly the same column names. In the following script, we create a new data frame for Uganda and append it to the existing data frame.

additional_tax_data = read.table(text = "Country PIT PAYE CIT
                                 Uganda 2.8 21.6 11.9", header = T)
additional_tax_data
  Country PIT PAYE  CIT
1  Uganda 2.8 21.6 11.9
tax_data_expanded = rbind(tax_data_merged, additional_tax_data)
tax_data_expanded
   Country  CIT PIT PAYE
1  Burundi 10.9 2.9 19.7
2    Kenya 15.8 5.2 24.5
3   Rwanda 12.3 2.5 23.3
4 Tanzania 14.2 4.1 26.2
5   Uganda 11.9 2.8 21.6

Subsetting

This is used when you want to select a part of the dataset for a given purpose.

subset_1 = tax_data_expanded[3,] #Selecting case number 3 (3rd row) with all variables

subset_2 = tax_data_expanded[2:4,] #Selecting from 2nd to 4th cases
subset_3 = tax_data_expanded[-3,] #Removing only the third row and keeping the others

subset_4 = tax_data_expanded[,"Country"] #Keeping only the first column (variable). You can also use tax_data_expanded[,0]
subset_5 = tax_data_expanded[,1:3] #Keeping only the first 3 columns. You can also use tax_data_expanded[,c("Country", "CIT", "PIT")]
subset_6 = tax_data_expanded[,-3] #Removing only the third column and keeping the others

subset_7 = tax_data_expanded[c(3,4),1:3] #This keeps the 3rd and 4th rows (Rwanda and Tanzania) and only 3 columns (Country, CIT and PIT)
subset_7
   Country  CIT PIT
3   Rwanda 12.3 2.5
4 Tanzania 14.2 4.1
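
Rows can also be selected by a logical condition using the base subset() function; a minimal sketch on the same tax data (the threshold of 12 is arbitrary):

# Keep only countries whose CIT collection exceeds 12, with selected columns
subset_8 = subset(tax_data_expanded, CIT > 12, select = c(Country, CIT, PIT))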

Handling missing data

Method 1 - subset analysis

With this method, we only consider complete cases

  1. Let’s create another column and bind it to our dataset text_dataset,
  2. Select only the complete cases using the complete.cases() function; rows with a missing age are then dropped.
Age_yrs = c(23,21,26,32,35,NA,34,32,40,NA,35,NA,23,23,34,NA,37,32,33,19,NA,20,23,NA,26,43,26,29,21,32,36,NA,NA,34,37,23,23,23,NA,27,32,33,32,36,NA,35,38,34,42,36,27,31,37,32,NA,28,26,NA,43,27,46,32,NA,NA,30)

text_dataset <- text_dataset[, !grepl("Age", names(text_dataset))] # First drop any columns whose names contain "Age"
text_dataset = data.frame(text_dataset, Age_yrs)
text_dataset_dropna = text_dataset[complete.cases(text_dataset$Age_yrs), ]
nrow(text_dataset_dropna)
[1] 51

Method 2 - imputation (replacing missing values with mean or median)

With this method, the missing values are replaced by the mean or median of the valid (non-missing) values.

# impute NA with mean
text_dataset_imputena = text_dataset
text_dataset_imputena$Age_yrs = ifelse(is.na(text_dataset_imputena$Age_yrs), round(mean(text_dataset_imputena$Age_yrs, na.rm = TRUE), digits = 0), text_dataset_imputena$Age_yrs)
nrow(text_dataset_imputena)
[1] 65
# impute NA with median
text_dataset_imputena = text_dataset
text_dataset_imputena$Age_yrs = ifelse(is.na(text_dataset_imputena$Age_yrs), round(median(text_dataset_imputena$Age_yrs, na.rm = TRUE), digits = 0), text_dataset_imputena$Age_yrs)
nrow(text_dataset_imputena)
[1] 65
# Note: base R's mode() returns the storage mode (e.g. "numeric"), not the statistical mode
text_dataset_imputena = text_dataset
text_dataset_imputena$Age_yrs = ifelse(is.na(text_dataset_imputena$Age_yrs), mode(text_dataset_imputena$Age_yrs), text_dataset_imputena$Age_yrs)
nrow(text_dataset_imputena)
[1] 65
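
Base R has no built-in statistical mode function, so mode imputation needs a small helper; a minimal sketch, where stat_mode() is a hypothetical helper defined here just for illustration:

# Helper returning the most frequent value (statistical mode), ignoring NAs
stat_mode = function(x) {
  x = x[!is.na(x)]
  as.numeric(names(which.max(table(x))))
}
text_dataset_imputena = text_dataset
text_dataset_imputena$Age_yrs[is.na(text_dataset_imputena$Age_yrs)] = stat_mode(text_dataset_imputena$Age_yrs)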

Summary statistics and data analysis

1. Summary stats (mean, median, quartiles)

#Mean
mean(text_dataset$Height)
[1] 155.1231
#Median
median(text_dataset$Height)
[1] 155
#Quartile
quantile(text_dataset$Height)
  0%  25%  50%  75% 100% 
 136  147  155  165  174 
#All summary stats
summary(text_dataset$Height)
   Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
  136.0   147.0   155.0   155.1   165.0   174.0 

2. Cross tabulation

The following script extracts a data frame called UCBAdmissions and produces summary statistics, then the contingency table (two-way table) for admission status vs gender. Furthermore, the row percentages are computed.

admissions = data.frame(UCBAdmissions)
variable.names(admissions)
[1] "Admit"  "Gender" "Dept"   "Freq"  
summary(admissions)
      Admit       Gender   Dept       Freq      
 Admitted:12   Male  :12   A:4   Min.   :  8.0  
 Rejected:12   Female:12   B:4   1st Qu.: 80.0  
                           C:4   Median :170.0  
                           D:4   Mean   :188.6  
                           E:4   3rd Qu.:302.5  
                           F:4   Max.   :512.0  
crosstab1 = xtabs(Freq~Gender+Admit, data = admissions) # Crosstab for frequencies
prop.table(crosstab1,1) # Percentage (1 is used to get row %, while 2 is used for col percent)
        Admit
Gender    Admitted  Rejected
  Male   0.4451877 0.5548123
  Female 0.3035422 0.6964578

3. Chi-square test for independence

The chi-square test is a test for independence between 2 categorical variables. The null hypothesis assumes that the two variables are independent of each other (no relationship between the 2). In our example, we test the independence of admission status and applicant gender.

chisq.test(admissions$Admit, admissions$Gender)

    Pearson's Chi-squared test

data:  admissions$Admit and admissions$Gender
X-squared = 0, df = 1, p-value = 1

With X-squared = 0 and a p-value of 1, we fail to reject the null hypothesis here. Note, however, that this call ignores the Freq column: the test is run on the 24 rows of the frequency data frame, where every Admit-Gender combination appears once per department, so it sees equal counts.
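
To take the actual applicant counts into account, the test can instead be applied to the Freq-weighted contingency table built earlier; a minimal sketch (results not shown here):

# Weighted test on the aggregated Gender x Admit table (crosstab1 was built above with xtabs)
chisq.test(crosstab1)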

4. Linear model and regression analysis

air_data = data.frame(airquality)

plot(air_data$Wind, air_data$Temp)
cor(air_data$Wind, air_data$Temp)
[1] -0.4579879
cor.test(air_data$Wind, air_data$Temp)

    Pearson's product-moment correlation

data:  air_data$Wind and air_data$Temp
t = -6.3308, df = 151, p-value = 2.642e-09
alternative hypothesis: true correlation is not equal to 0
95 percent confidence interval:
 -0.5748874 -0.3227660
sample estimates:
       cor 
-0.4579879 
plot(air_data$Wind, air_data$Temp)
linear_model = lm(Temp~Wind, data = air_data)
abline((linear_model)$coefficient)
lines(lowess(air_data$Wind, air_data$Temp), col="blue")

summary(linear_model)

Call:
lm(formula = Temp ~ Wind, data = air_data)

Residuals:
    Min      1Q  Median      3Q     Max 
-23.291  -5.723   1.709   6.016  19.199 

Coefficients:
            Estimate Std. Error t value Pr(>|t|)    
(Intercept)  90.1349     2.0522  43.921  < 2e-16 ***
Wind         -1.2305     0.1944  -6.331 2.64e-09 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 8.442 on 151 degrees of freedom
Multiple R-squared:  0.2098,    Adjusted R-squared:  0.2045 
F-statistic: 40.08 on 1 and 151 DF,  p-value: 2.642e-09

Data management with the dplyr package

The dplyr package is widely used in data management.

#Start by installing the packages if not installed
#install.packages("dplyr")
#install.packages("nycflights13")
#install.packages("tidyverse)


library(dplyr)

Attaching package: 'dplyr'
The following objects are masked from 'package:stats':

    filter, lag
The following objects are masked from 'package:base':

    intersect, setdiff, setequal, union
library(nycflights13)
library(tidyverse)
── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
✔ forcats   1.0.0     ✔ readr     2.1.5
✔ ggplot2   3.5.2     ✔ stringr   1.5.1
✔ lubridate 1.9.4     ✔ tibble    3.3.0
✔ purrr     1.1.0     ✔ tidyr     1.3.1
── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag()    masks stats::lag()
ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
flights_data = nycflights13::flights
#View(flights_data)

Filter function

The filter() function is shown below:

#Filtering data
jan_flights = filter(flights_data, month==1) #interested in flights that took place in January
dec_flights = filter(flights_data, month==12)
apr_flights = filter(flights_data, month==4)
ny_flights = filter(flights_data, month==1, day==1)
xmas_flights = filter(flights_data, month==12, day==25)
dec_jan_flights = filter(flights_data, month==12 | month==1)
jan_june_flights = filter(flights_data, month %in% c(1,2,3,4,5,6))

nrow(jan_flights)
[1] 27004
nrow(dec_flights)
[1] 28135
nrow(apr_flights)
[1] 28330
str(dec_jan_flights)
tibble [55,139 × 19] (S3: tbl_df/tbl/data.frame)
 $ year          : int [1:55139] 2013 2013 2013 2013 2013 2013 2013 2013 2013 2013 ...
 $ month         : int [1:55139] 1 1 1 1 1 1 1 1 1 1 ...
 $ day           : int [1:55139] 1 1 1 1 1 1 1 1 1 1 ...
 $ dep_time      : int [1:55139] 517 533 542 544 554 554 555 557 557 558 ...
 $ sched_dep_time: int [1:55139] 515 529 540 545 600 558 600 600 600 600 ...
 $ dep_delay     : num [1:55139] 2 4 2 -1 -6 -4 -5 -3 -3 -2 ...
 $ arr_time      : int [1:55139] 830 850 923 1004 812 740 913 709 838 753 ...
 $ sched_arr_time: int [1:55139] 819 830 850 1022 837 728 854 723 846 745 ...
 $ arr_delay     : num [1:55139] 11 20 33 -18 -25 12 19 -14 -8 8 ...
 $ carrier       : chr [1:55139] "UA" "UA" "AA" "B6" ...
 $ flight        : int [1:55139] 1545 1714 1141 725 461 1696 507 5708 79 301 ...
 $ tailnum       : chr [1:55139] "N14228" "N24211" "N619AA" "N804JB" ...
 $ origin        : chr [1:55139] "EWR" "LGA" "JFK" "JFK" ...
 $ dest          : chr [1:55139] "IAH" "IAH" "MIA" "BQN" ...
 $ air_time      : num [1:55139] 227 227 160 183 116 150 158 53 140 138 ...
 $ distance      : num [1:55139] 1400 1416 1089 1576 762 ...
 $ hour          : num [1:55139] 5 5 5 5 6 5 6 6 6 6 ...
 $ minute        : num [1:55139] 15 29 40 45 0 58 0 0 0 0 ...
 $ time_hour     : POSIXct[1:55139], format: "2013-01-01 05:00:00" "2013-01-01 05:00:00" ...

Arrange function

help(arrange)
arrange(flights_data, carrier, year, month, day) #used to sort data based on one or multiple columns
# A tibble: 336,776 × 19
    year month   day dep_time sched_dep_time dep_delay arr_time sched_arr_time
   <int> <int> <int>    <int>          <int>     <dbl>    <int>          <int>
 1  2013     1     1      810            810         0     1048           1037
 2  2013     1     1     1451           1500        -9     1634           1636
 3  2013     1     1     1452           1455        -3     1637           1639
 4  2013     1     1     1454           1500        -6     1635           1636
 5  2013     1     1     1507           1515        -8     1651           1656
 6  2013     1     1     1530           1530         0     1650           1655
 7  2013     1     1     1546           1540         6     1753           1748
 8  2013     1     1     1550           1550         0     1844           1831
 9  2013     1     1     1552           1600        -8     1749           1757
10  2013     1     1     1554           1600        -6     1701           1734
# ℹ 336,766 more rows
# ℹ 11 more variables: arr_delay <dbl>, carrier <chr>, flight <int>,
#   tailnum <chr>, origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>,
#   hour <dbl>, minute <dbl>, time_hour <dttm>
arrange(flights_data, desc(month), desc(day)) #descending ordering
# A tibble: 336,776 × 19
    year month   day dep_time sched_dep_time dep_delay arr_time sched_arr_time
   <int> <int> <int>    <int>          <int>     <dbl>    <int>          <int>
 1  2013    12    31       13           2359        14      439            437
 2  2013    12    31       18           2359        19      449            444
 3  2013    12    31       26           2245       101      129           2353
 4  2013    12    31      459            500        -1      655            651
 5  2013    12    31      514            515        -1      814            812
 6  2013    12    31      549            551        -2      925            900
 7  2013    12    31      550            600       -10      725            745
 8  2013    12    31      552            600        -8      811            826
 9  2013    12    31      553            600        -7      741            754
10  2013    12    31      554            550         4     1024           1027
# ℹ 336,766 more rows
# ℹ 11 more variables: arr_delay <dbl>, carrier <chr>, flight <int>,
#   tailnum <chr>, origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>,
#   hour <dbl>, minute <dbl>, time_hour <dttm>

Select function (used when interested in specific variables)

selected_flights = select(flights_data, year, month, day) #only keep three variables (year, month, and day)

selected_flights = select(flights_data, (year:arr_time)) #keeping all columns between year and arr_time

selected_flights = select(flights_data, -(year:arr_time)) #dropping all variables from column year to arr_time

selected_flights
# A tibble: 336,776 × 12
   sched_arr_time arr_delay carrier flight tailnum origin dest  air_time
            <int>     <dbl> <chr>    <int> <chr>   <chr>  <chr>    <dbl>
 1            819        11 UA        1545 N14228  EWR    IAH        227
 2            830        20 UA        1714 N24211  LGA    IAH        227
 3            850        33 AA        1141 N619AA  JFK    MIA        160
 4           1022       -18 B6         725 N804JB  JFK    BQN        183
 5            837       -25 DL         461 N668DN  LGA    ATL        116
 6            728        12 UA        1696 N39463  EWR    ORD        150
 7            854        19 B6         507 N516JB  EWR    FLL        158
 8            723       -14 EV        5708 N829AS  LGA    IAD         53
 9            846        -8 B6          79 N593JB  JFK    MCO        140
10            745         8 AA         301 N3ALAA  LGA    ORD        138
# ℹ 336,766 more rows
# ℹ 4 more variables: distance <dbl>, hour <dbl>, minute <dbl>,
#   time_hour <dttm>

Mutate function (used to create a new variable)

flights_data_2 = select(flights, year:day, ends_with("delay"), distance, ends_with("time"))
flights_data_3 = mutate(flights_data_2, gain=arr_delay-dep_delay, speed=round(distance/air_time*60,digits = 1))
flights_data_3
# A tibble: 336,776 × 13
    year month   day dep_delay arr_delay distance dep_time sched_dep_time
   <int> <int> <int>     <dbl>     <dbl>    <dbl>    <int>          <int>
 1  2013     1     1         2        11     1400      517            515
 2  2013     1     1         4        20     1416      533            529
 3  2013     1     1         2        33     1089      542            540
 4  2013     1     1        -1       -18     1576      544            545
 5  2013     1     1        -6       -25      762      554            600
 6  2013     1     1        -4        12      719      554            558
 7  2013     1     1        -5        19     1065      555            600
 8  2013     1     1        -3       -14      229      557            600
 9  2013     1     1        -3        -8      944      557            600
10  2013     1     1        -2         8      733      558            600
# ℹ 336,766 more rows
# ℹ 5 more variables: arr_time <int>, sched_arr_time <int>, air_time <dbl>,
#   gain <dbl>, speed <dbl>

Rename function (used for renaming variables)

renamed_flights_data = rename(flights_data_3, scheduled_dep_time = sched_dep_time, scheduled_arr_time = sched_arr_time)
names(renamed_flights_data)
 [1] "year"               "month"              "day"               
 [4] "dep_delay"          "arr_delay"          "distance"          
 [7] "dep_time"           "scheduled_dep_time" "arr_time"          
[10] "scheduled_arr_time" "air_time"           "gain"              
[13] "speed"             

group_by() and %>%

group_by() is used to compute outputs by groups. The pipe operator %>% allows us to chain multiple operations together, improving code readability.

test_data = mtcars
group_by(test_data, cyl)
# A tibble: 32 × 11
# Groups:   cyl [3]
     mpg   cyl  disp    hp  drat    wt  qsec    vs    am  gear  carb
   <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
 1  21       6  160    110  3.9   2.62  16.5     0     1     4     4
 2  21       6  160    110  3.9   2.88  17.0     0     1     4     4
 3  22.8     4  108     93  3.85  2.32  18.6     1     1     4     1
 4  21.4     6  258    110  3.08  3.22  19.4     1     0     3     1
 5  18.7     8  360    175  3.15  3.44  17.0     0     0     3     2
 6  18.1     6  225    105  2.76  3.46  20.2     1     0     3     1
 7  14.3     8  360    245  3.21  3.57  15.8     0     0     3     4
 8  24.4     4  147.    62  3.69  3.19  20       1     0     4     2
 9  22.8     4  141.    95  3.92  3.15  22.9     1     0     4     2
10  19.2     6  168.   123  3.92  3.44  18.3     1     0     4     4
# ℹ 22 more rows
by_cyl <- mtcars %>% group_by(cyl)


#using %>%
test_data %>%
  group_by(cyl) %>%
  summarise(mean_weight = mean(wt)) %>%
  arrange(cyl)
# A tibble: 3 × 2
    cyl mean_weight
  <dbl>       <dbl>
1     4        2.29
2     6        3.12
3     8        4.00

Data visualization with ggplot2

#install package if not installed
#install.packages("ggplot2")
library(ggplot2)

cars_data = data.frame(mtcars)
table(cars_data$cyl)

 4  6  8 
11  7 14 
#Bar plot
barplot(table(cars_data$cyl), main = "Car cylinders", col="orange", border="orange", ylab="Frequency", xlab="Number of cylinders")

#Dot plot
dotchart(cars_data$mpg, pch=3, labels = row.names(cars_data), cex=.5, main="Miles per gallon of the car model", xlab="MPG")

#Scatter plot
plot(cars_data$mpg~cars_data$wt, main="Automobile data", xlab="Weight", ylab="Miles per gallon", col="blue", pch=10)

ggplot(data=cars_data, aes(x=wt,y=mpg))+geom_point(size=1, col="blue")+labs(title = "Automobile data", x="Weight", y="Miles per gallon")+geom_smooth(method = "lm", color="orange")
`geom_smooth()` using formula = 'y ~ x'

#Comparison chart
cars_data$am = factor(cars_data$am, levels = c(0,1), labels = c("Automatic", "Manual"))
cars_data$vs = factor(cars_data$vs, levels = c(0,1), labels = c("V-Engine", "Straight Engine"))
cars_data$cyl = factor(cars_data$cyl)
ggplot(data=cars_data, aes(x=hp,y=mpg, shape=cyl, color=cyl))+geom_point(size=2)+facet_grid(am~vs)+labs(title = "Automobile data by engine type", x="HorsePower", y="Miles per gallon")

basic_gg_plot = ggplot(data=cars_data, aes(x=wt,y=mpg))+geom_point(size=1, col="blue")+labs(title = "Automobile data", x="Weight", y="Miles per gallon")

geom_hline() adds a horizontal line. The plot below marks the average miles per gallon:

ggplot(data=cars_data, aes(x=wt,y=mpg))+geom_point(size=1, col="blue")+labs(title = "Automobile data", x="Weight", y="Miles per gallon")+geom_hline(yintercept = mean(cars_data$mpg), linetype = "dashed", color = "red") + geom_text(aes(x = 5.2, y = 21.4, label = "Average MPG"), color = "red")
Warning in geom_text(aes(x = 5.2, y = 21.4, label = "Average MPG"), color = "red"): All aesthetics have length 1, but the data has 32 rows.
ℹ Please consider using `annotate()` or provide this layer with data containing
  a single row.

The geom_jitter() function is used to create a jittered scatter plot. It adds a small amount of random noise to the data points along the x-axis or y-axis, helping to visualize the distribution of points when they overlap.

ggplot(data=cars_data, aes(x=wt,y=mpg))+geom_point(size=1, col="blue")+labs(title = "Automobile data", x="Weight", y="Miles per gallon")+geom_jitter(width = 0.2, height = 0.2, color = "grey") +
  theme_minimal()

help(geom_rug)

The geom_line() function is used to add lines to a plot. It connects the data points in the order they appear in the data frame, creating a line chart.

ggplot(data=cars_data, aes(x=wt,y=mpg))+geom_point(size=2, col="blue")+labs(title = "Automobile data", x="Weight", y="Miles per gallon")+geom_line(color = "blue")

The geom_rug() function is used to add marginal rugs to a plot. Rugs are small lines or ticks that extend along the axes, indicating the distribution of data points along a particular axis. geom_rug() can be used with either the x-axis or y-axis, or both.

ggplot(data=cars_data, aes(x=wt,y=mpg))+geom_point(size=2, col="blue")+labs(title = "Automobile data", x="Weight", y="Miles per gallon")+geom_rug(sides = "bl")

Control structures

  1. If and else
  2. For
  3. While
  4. Repeat
  5. Break
  6. Next
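
A minimal sketch illustrating these control structures (the vector and the threshold of 10 are made up for illustration):

values = c(4, 7, NA, 15, 2)
for (v in values) {
  if (is.na(v)) {
    next                      # skip missing values
  } else if (v > 10) {
    print("Large value found, stopping")
    break                     # exit the loop early
  }
  print(v)
}

i = 1
while (i <= 3) {              # runs as long as the condition holds
  i = i + 1
}

repeat {                      # repeat runs until an explicit break
  i = i - 1
  if (i == 0) break
}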

Functions in R

f = function(x) x^2
formals(f)
$x
body(f)
x^2
f(5)
[1] 25
p_values = c(0.03,0.034,0.23,0.001,0.08)
significance = ifelse(p_values < 0.05, "Significant", "Not significant")
significance
[1] "Significant"     "Significant"     "Not significant" "Significant"    
[5] "Not significant"

Generating random numbers following a given distribution

runif(10,0,20) #this function will generate 10 observations within the interval [0,20], following a uniform distribution
 [1]  7.868876  8.083936 13.817482  4.523355 19.844741  3.227845  5.281990
 [8]  6.744382  3.828884 18.001967
rpois(10, 12) #this will generate 10 values following a Poisson distribution with lambda=12
 [1] 11 10 11 17  9 12 11 17  8 14
ran_data = runif(10,0,20)

apply() and lapply() functions

  1. apply(data, margin, function): data is your dataset, margin selects the dimension (1 for rows, 2 for columns), and function is the function to apply, e.g. mean, sum, ...

  2. lapply() applies a function (including a custom one) to each element of a list or vector and returns a list (see the sketch below).
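
A minimal sketch of both functions, using the built-in iris data (the list name measurements and the range-style function are made up for illustration):

# apply(): column means of the four numeric iris variables (margin 2 = columns)
apply(iris[, 1:4], 2, mean)

# lapply(): apply a custom function to each element of a list; the result is a list
measurements = list(sepal = iris$Sepal.Length, petal = iris$Petal.Length)
lapply(measurements, function(x) max(x) - min(x))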

Titanic data

str(Titanic)
 'table' num [1:4, 1:2, 1:2, 1:2] 0 0 35 0 0 0 17 0 118 154 ...
 - attr(*, "dimnames")=List of 4
  ..$ Class   : chr [1:4] "1st" "2nd" "3rd" "Crew"
  ..$ Sex     : chr [1:2] "Male" "Female"
  ..$ Age     : chr [1:2] "Child" "Adult"
  ..$ Survived: chr [1:2] "No" "Yes"
apply(Titanic, c(3,2,4), sum)
, , Survived = No

       Sex
Age     Male Female
  Child   35     17
  Adult 1329    109

, , Survived = Yes

       Sex
Age     Male Female
  Child   29     28
  Adult  338    316
summary(Titanic)
Number of cases in table: 2201 
Number of factors: 4 
Test for independence of all factors:
    Chisq = 1637.4, df = 25, p-value = 0
    Chi-squared approximation may be incorrect
numbers = list(a = 1, b = 2, c = 3)
sapply(numbers, function(x) x * 2)
a b c 
2 4 6 
#The vapply function is similar to sapply, but it allows you to specify the type of the result. This can be useful when you want to ensure the result has a specific structure.
vapply(numbers, function(x) x * 2, numeric(1))
a b c 
2 4 6 

T-test

The t.test() function is used to perform one- and two-sample t-tests on vectors of data.

t.test(mpg~vs, data=mtcars)

    Welch Two Sample t-test

data:  mpg by vs
t = -4.6671, df = 22.716, p-value = 0.0001098
alternative hypothesis: true difference in means between group 0 and group 1 is not equal to 0
95 percent confidence interval:
 -11.462508  -4.418445
sample estimates:
mean in group 0 mean in group 1 
       16.61667        24.55714 
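
Since one-sample tests are mentioned above, here is a minimal sketch testing whether the mean mpg in mtcars differs from a hypothesised value of 20 (the value 20 is chosen purely for illustration):

# One-sample t-test: is the mean mpg different from 20?
t.test(mtcars$mpg, mu = 20)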

Regression analysis

str(women)
'data.frame':   15 obs. of  2 variables:
 $ height: num  58 59 60 61 62 63 64 65 66 67 ...
 $ weight: num  115 117 120 123 126 129 132 135 139 142 ...
fit = lm(weight~height, data=women)
plot(resid(fit))
fit

Call:
lm(formula = weight ~ height, data = women)

Coefficients:
(Intercept)       height  
     -87.52         3.45  
abline(fit$coefficient)

round(fitted(fit), digits = 0)
  1   2   3   4   5   6   7   8   9  10  11  12  13  14  15 
113 116 119 123 126 130 133 137 140 144 147 151 154 157 161 
summary(fit)

Call:
lm(formula = weight ~ height, data = women)

Residuals:
    Min      1Q  Median      3Q     Max 
-1.7333 -1.1333 -0.3833  0.7417  3.1167 

Coefficients:
             Estimate Std. Error t value Pr(>|t|)    
(Intercept) -87.51667    5.93694  -14.74 1.71e-09 ***
height        3.45000    0.09114   37.85 1.09e-14 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 1.525 on 13 degrees of freedom
Multiple R-squared:  0.991, Adjusted R-squared:  0.9903 
F-statistic:  1433 on 1 and 13 DF,  p-value: 1.091e-14
fit2 = lm(weight ~ poly(height, degree = 2, raw = TRUE), data = women)
summary(fit2)

Call:
lm(formula = weight ~ poly(height, degree = 2, raw = TRUE), data = women)

Residuals:
     Min       1Q   Median       3Q      Max 
-0.50941 -0.29611 -0.00941  0.28615  0.59706 

Coefficients:
                                       Estimate Std. Error t value Pr(>|t|)    
(Intercept)                           261.87818   25.19677  10.393 2.36e-07 ***
poly(height, degree = 2, raw = TRUE)1  -7.34832    0.77769  -9.449 6.58e-07 ***
poly(height, degree = 2, raw = TRUE)2   0.08306    0.00598  13.891 9.32e-09 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 0.3841 on 12 degrees of freedom
Multiple R-squared:  0.9995,    Adjusted R-squared:  0.9994 
F-statistic: 1.139e+04 on 2 and 12 DF,  p-value: < 2.2e-16

Regression diagnostics

par(mfrow = c(2,2))
plot(fit)

plot(fit2)

fit3 = lm(weight~height+I(height^2), data=women)
par(mfrow = c(2,2))
plot(fit3)


#Multiple regression
air_reg = lm(Temp~Wind+Ozone, data=airquality)
summary(air_reg)

Call:
lm(formula = Temp ~ Wind + Ozone, data = airquality)

Residuals:
    Min      1Q  Median      3Q     Max 
-21.487  -5.137   1.220   4.674  13.102 

Coefficients:
            Estimate Std. Error t value Pr(>|t|)    
(Intercept) 74.18035    2.96526  25.017  < 2e-16 ***
Wind        -0.37829    0.22080  -1.713   0.0894 .  
Ozone        0.17615    0.02393   7.362 3.15e-11 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 6.762 on 113 degrees of freedom
  (37 observations deleted due to missingness)
Multiple R-squared:  0.5007,    Adjusted R-squared:  0.4918 
F-statistic: 56.65 on 2 and 113 DF,  p-value: < 2.2e-16

Parametric tests

library(MASS)

Attaching package: 'MASS'
The following object is masked from 'package:dplyr':

    select
data("UScrime")
head(UScrime)
    M So  Ed Po1 Po2  LF  M.F Pop  NW  U1 U2 GDP Ineq     Prob    Time    y
1 151  1  91  58  56 510  950  33 301 108 41 394  261 0.084602 26.2011  791
2 143  0 113 103  95 583 1012  13 102  96 36 557  194 0.029599 25.2999 1635
3 142  1  89  45  44 533  969  18 219  94 33 318  250 0.083401 24.3006  578
4 136  0 121 149 141 577  994 157  80 102 39 673  167 0.015801 29.9012 1969
5 141  0 121 109 101 591  985  18  30  91 20 578  174 0.041399 21.2998 1234
6 121  0 110 118 115 547  964  25  44  84 29 689  126 0.034201 20.9995  682
## independent t test
t.test(Prob~So, data=UScrime)

    Welch Two Sample t-test

data:  Prob by So
t = -3.8954, df = 24.925, p-value = 0.0006506
alternative hypothesis: true difference in means between group 0 and group 1 is not equal to 0
95 percent confidence interval:
 -0.03852569 -0.01187439
sample estimates:
mean in group 0 mean in group 1 
     0.03851265      0.06371269 
## Paired test
# t.test(y1,y2,paired=TRUE)
sapply(UScrime[c("U1","U2")],function(x)(c(mean=mean(x),sd=sd(x))))
           U1       U2
mean 95.46809 33.97872
sd   18.02878  8.44545
with(UScrime,t.test(U1,U2,paired=TRUE))

    Paired t-test

data:  U1 and U2
t = 32.407, df = 46, p-value < 2.2e-16
alternative hypothesis: true mean difference is not equal to 0
95 percent confidence interval:
 57.67003 65.30870
sample estimates:
mean difference 
       61.48936 

Non-parametric tests

#Wilcoxon
wilcox.test(Prob~So, data=UScrime)

    Wilcoxon rank sum exact test

data:  Prob by So
W = 81, p-value = 8.488e-05
alternative hypothesis: true location shift is not equal to 0
#Kruskal-Wallis test
states<-data.frame(state.region,state.x77)
kruskal.test(Illiteracy~state.region,data=states)

    Kruskal-Wallis rank sum test

data:  Illiteracy by state.region
Kruskal-Wallis chi-squared = 22.672, df = 3, p-value = 4.726e-05

Simulation

Generating random numbers

x <- rnorm(10)          # 10 draws from the standard normal distribution
x <- rnorm(10, 20, 2)   # 10 draws from a normal distribution with mean 20 and sd 2

set.seed(20)            # fix the seed so the simulation is reproducible
x <- rnorm(100)
e <- rnorm(100, 0, 2)   # random noise with mean 0 and sd 2
y <- 0.5 + 2 * x + e    # simulate y from a linear model of x
plot(x, y)

Data Visualization

country<-c("Australia", "Austria", "Belgium","Canada", "Denmark", "Finland","France","Germany",
"Greece","Ireland","Italy","Japan", "Netherland",
"New Zealand", "Norway", "Portugal", "Spain","Sweden", "Switzerland","UK","USA")

Income.inequality<- c(7.0,4.8,4.6,5.6,4.3,3.7,5.6,5.2,6.2,6.0,6.7,3.4,5.3,6.8,3.9,8.0,5.5,4.0,5.7 ,7.2,8.6)
Index.HS<-c(0.07,0.01,-0.23,-0.07,-0.19,-0.43,0.05,-0.06,0.38,0.25,-0.12,-1.26,-0.51,0.29,-0.63,1.18,-0.30,-0.83,-0.46,0.79,2.02)

GDP_WB<- c(45926,47682,43435,45066,45537,30676,39328,46401,26851,49393,35463,36319,48253,37679,65615
,28760,33629,45297,59540,40233,54630) 

data.21<-data.frame(country,Income.inequality,Index.HS,GDP_WB)
plot(data.21[c("Income.inequality","Index.HS")])

Index_inequality.df<-data.21[c("Income.inequality","Index.HS")] 
str(Index_inequality.df)
'data.frame':   21 obs. of  2 variables:
 $ Income.inequality: num  7 4.8 4.6 5.6 4.3 3.7 5.6 5.2 6.2 6 ...
 $ Index.HS         : num  0.07 0.01 -0.23 -0.07 -0.19 -0.43 0.05 -0.06 0.38 0.25 ...
(country<-data.21[,"country"])
 [1] "Australia"   "Austria"     "Belgium"     "Canada"      "Denmark"    
 [6] "Finland"     "France"      "Germany"     "Greece"      "Ireland"    
[11] "Italy"       "Japan"       "Netherland"  "New Zealand" "Norway"     
[16] "Portugal"    "Spain"       "Sweden"      "Switzerland" "UK"         
[21] "USA"        
(country.2<-data.21["country"]) 
       country
1    Australia
2      Austria
3      Belgium
4       Canada
5      Denmark
6      Finland
7       France
8      Germany
9       Greece
10     Ireland
11       Italy
12       Japan
13  Netherland
14 New Zealand
15      Norway
16    Portugal
17       Spain
18      Sweden
19 Switzerland
20          UK
21         USA
str(country)
 chr [1:21] "Australia" "Austria" "Belgium" "Canada" "Denmark" "Finland" ...
str(country.2)
'data.frame':   21 obs. of  1 variable:
 $ country: chr  "Australia" "Austria" "Belgium" "Canada" ...
plot(Index_inequality.df,pch=20)
text(Index_inequality.df,labels=country, cex=.75, pos=1)

#Handling overlapping texts
which(country %in% c("Austria","Denmark","Germany","Netherland")) 
[1]  2  5  8 13
text.left<-which(country %in% c("Austria","Denmark","Germany","Netherland"))
text.left
[1]  2  5  8 13
text.right<-setdiff(1:nrow(data.21),text.left) 
text.right
 [1]  1  3  4  6  7  9 10 11 12 14 15 16 17 18 19 20 21
pos.text<-ifelse(1:nrow(data.21)%in% text.left,2,4) 
plot(Index_inequality.df,pch=20,col="orange",xlim=c(3,9),ylim=c(-1.5,2.5))
text(Index_inequality.df,labels=country,pos=pos.text,cex=0.7)

text.up<-which(country %in% "Germany") 
text.up
[1] 8
text.left<-setdiff(1:nrow(data.21),c(text.right,text.up)) 
text.left
[1]  2  5 13
pos.text<-ifelse(1:nrow(data.21) %in% text.up,3,ifelse(1:nrow(data.21)%in% text.left,2,4))
plot(Index_inequality.df,pch=20,col="orange",xlim=c(3,9),ylim=c(- 1.5,2.5),ann=FALSE)
text(Index_inequality.df,labels = country,pos=pos.text,cex=0.7) 
main.title<-"Income inequality vs Index of Health and Social problems" 
x.lab<-"Income inequality (5th Ratio)"
y.lab<-"Index of Health and Social problems" 
title(main=main.title,xlab=x.lab,ylab=y.lab)


title(main=main.title,xlab=x.lab,ylab=y.lab) 
mtext(c("Better","Worse"),side=2,at=c(-1.8,2.8),las=1) 
text(x=5,y=1.5,labels=paste("r=",round(cor(Index_inequality.df[1],Index_inequality.df[2]),digits=2)))