This document was produced by Celestin Niyomugabo (celestin@vonsung.co.rw).
Visit
www.CelestinNiyomugabo.com
for booking arrangements.
Note: Some of the data used in this document are dummy or synthetic and can not be cited for reference. They are only used for educational purpose.
The vector can be created using c and can be assigned to
a name using equal assignment = or
leftward assignment <-. The scripts
below shows how to create:
vector <- c(12,34,56,78,97)
bolean <- c(TRUE, FALSE)
employees_names = c("Joe", "Doe", "Claire", "Peter", "Tom")
employees_age = c(23,43,25,31,48)
employees_salary = c(12000,23500,42100,34500,69200)
employee_sex = c("Male", "Male", "Female", "Male", "Female")
new_age = employees_age+3
# A vector od sequence of numbers, starting from 1 to 5
serial_number = c(1:5)
# A vector of complex numbers
complex_vector = c(1+0i, 1+2i)
print(complex_vector)
[1] 1+0i 1+2i
# The vectors (also known as column) can be combined using cbind()
employees = cbind(employees_names, employees_age, employees_salary, employee_sex)
print(employees)
employees_names employees_age employees_salary employee_sex
[1,] "Joe" "23" "12000" "Male"
[2,] "Doe" "43" "23500" "Male"
[3,] "Claire" "25" "42100" "Female"
[4,] "Peter" "31" "34500" "Male"
[5,] "Tom" "48" "69200" "Female"
# Matrix
new_matrix_1 = matrix(1:20, nrow = 4)
new_matrix_2 = matrix(4:11, ncol = 4)
# Matrix operation
matrix_product = new_matrix_2 %*% new_matrix_1
print(matrix_product)
[,1] [,2] [,3] [,4] [,5]
[1,] 80 192 304 416 528
[2,] 90 218 346 474 602
# Creating a list
new_list = list(employees_names, serial_number, matrix_product, bolean)
new_list
[[1]]
[1] "Joe" "Doe" "Claire" "Peter" "Tom"
[[2]]
[1] 1 2 3 4 5
[[3]]
[,1] [,2] [,3] [,4] [,5]
[1,] 80 192 304 416 528
[2,] 90 218 346 474 602
[[4]]
[1] TRUE FALSE
# A vector with missing value
with_missing = c(12,34,55,23, NA, 13, NA, 23)
na_values = is.na(with_missing)
filter_data = with_missing[!na_values]
filter_data
[1] 12 34 55 23 13 23
double_filter = complete.cases(employees_names, employees_salary)
To create a data frame, you cane use data.frame() or use
cbind
# DataFrame
employees_data = as.data.frame(employees)
employees_data = data.frame(employees_names, employees_age, employees_salary, employee_sex)
employees_data
employees_names employees_age employees_salary employee_sex
1 Joe 23 12000 Male
2 Doe 43 23500 Male
3 Claire 25 42100 Female
4 Peter 31 34500 Male
5 Tom 48 69200 Female
You can use summary() command to get the summary
statistics of a given variable. This includes the minimum and maximum,
mean, median, as long as quartiles.
mean(employees_data$employees_salary) # Mean for the column employees_salary
[1] 36260
summary(employees_data$employees_age) # Summary statistics for employees_age
Min. 1st Qu. Median Mean 3rd Qu. Max.
23 25 31 34 43 48
summary(employees_data) # Mean for the entire dataframe
employees_names employees_age employees_salary employee_sex
Length:5 Min. :23 Min. :12000 Length:5
Class :character 1st Qu.:25 1st Qu.:23500 Class :character
Mode :character Median :31 Median :34500 Mode :character
Mean :34 Mean :36260
3rd Qu.:43 3rd Qu.:42100
Max. :48 Max. :69200
The data frame can also be created by using data editor in R
dataset_1 = data.frame(height=numeric(0), weight=numeric(0))
# dataset_1 = edit(dataset_1)
R can read data from external files such as CSV, text files, stata files, etc. The following script shows the data imported from a CSV and a text file. The names used when creating the text file were sourced from this link: https://1000randomnames.com/ - just for practice purpose only. To start working with files, it’s better to first set the working directory
I also imported a Stata (.dta) file after installing
haven package.
# Set working directory
directory_of_current_file = dirname(file.path(getwd(), "Practice.Rmd")) # The file is currently saved at "/Users/vonsung/ownCloud - Celestin Niyomugabo@cloud.vonsung.io/AUCA - Big Data/R for Data Science/" from my computer
setwd(directory_of_current_file) # Setting current wd do where the file Paractice 1.Rmd is saved
csv_data = read.csv(file="datasets/sample data.csv", header= T)
text_dataset = read.table(file="datasets/sample data.txt", header = T, sep = "\t")
#install.packages("haven")
library("haven")
stata_dataset = read_dta(file="datasets/stata dataset.dta")
colname() can be used to view the names of the columns
(variables) for the data imported.head() to preview some
rows of the dataframe.View() to preview the entire dataframelength() to get the number of vqriables within the
dataframecolnames(text_dataset)
[1] "Name" "Age" "Height" "Weight" "Class"
head(stata_dataset)
# A tibble: 6 × 100
consent interview_date province district sector type sex dob
<dbl+lbl> <chr> <dbl+lbl> <dbl+lb> <dbl+lbl> <dbl+l> <dbl+l> <chr>
1 1 [Yes] 2023-05-13 4 [Norther… 45 [Rul… 4517 [Tum… 1 [Ind… 1 [Mal… 1959…
2 1 [Yes] 2023-05-14 4 [Norther… 45 [Rul… 4504 [Buy… 1 [Ind… 1 [Mal… 1959…
3 1 [Yes] 2023-05-14 4 [Norther… 45 [Rul… 4504 [Buy… 1 [Ind… 1 [Mal… 1983…
4 1 [Yes] 2023-05-14 4 [Norther… 45 [Rul… 4517 [Tum… 1 [Ind… 1 [Mal… 1964…
5 1 [Yes] 2023-05-14 4 [Norther… 45 [Rul… 4517 [Tum… 1 [Ind… 2 [Fem… 1964…
6 1 [Yes] 2023-05-14 4 [Norther… 45 [Rul… 4517 [Tum… 1 [Ind… 2 [Fem… 1983…
# ℹ 92 more variables: age <dbl>, education <dbl+lbl>, hh_size <dbl+lbl>,
# income1 <dbl+lbl>, income2 <dbl+lbl>, tea_area <dbl+lbl>,
# tea_income <dbl+lbl>, formal_credit <dbl+lbl>,
# formal_credit_count <dbl+lbl>, lender_institution_type <dbl+lbl>,
# other_lender_institution_type <chr>, formal_credit_size <dbl+lbl>,
# formal_payback_period <dbl+lbl>, formal_interest_rate <dbl>,
# formal_defaulted <dbl+lbl>, formal_defaulted_type <dbl+lbl>, …
#View(stata_dataset)
length(text_dataset)
[1] 5
1. Histogram plot (used for discret nueric data)
hist() to plot a histogram. main='' is
used to define the plot header/title,xlab='' and ylab='' define horizontal and
vertical axes respectively,col='' is used to define the color of histogram bars.
The color can either be defined using the color name (Eg: red, blue,
green, orange,..) or RGB HEX code (eg: #FFFFFF for white, #00FF00 for
green, etc…).border attribute is used to customize the border
color,yaxt = 'n' and xaxt = 'n' are used to hide
y and x axes respectively. To hide all axes, use
axes = Ftext() function to display the frequencies per
each age categoryhistogram = hist(text_dataset$Age, main="Histogram plot for age", xlab = "Age", ylab = "", col = "#aff218", border="#000000", freq = T, axes = T, yaxt = "n")
text(histogram$mids, histogram$counts, labels = histogram$counts, pos = 3, col = "black")
2.Stacked diagram and side diagram
Both stack diagram and side diagram can be plotted using
barplot() with the following arguments:
beside='' argument to False,beside='' to
True (set it to T or True)col='' need to be specified in a form of a vector
since we need to specify the color for each categorynames.arg = '' specify the grouping
category/variable.In the example below, a dataframe for different tax types was first created (Corporate Income Tax and Personal Income Tax) in different countries; then the bar plot was used.
# below we crete albirtary data for tax collections by country
cit = c(12.3, 14.2, 10.9, 15.8)
pit = c(2.5, 4.1, 2.9, 5.2)
countries = c("Rwanda", "Tanzania", "Burundi", "Kenya")
tax_data = data.frame(Country = countries, CIT = cit, PIT = pit) # A dataframe for taxes collected
tax_data
Country CIT PIT
1 Rwanda 12.3 2.5
2 Tanzania 14.2 4.1
3 Burundi 10.9 2.9
4 Kenya 15.8 5.2
barplot(rbind(tax_data$CIT, tax_data$PIT), beside = T, col = c("green", "gold"), names.arg = tax_data$Country, main = "CIT vs PIT collection in 2023 (in Bn USD)")
legend("topright", legend = c("CIT", "PIT"), fill = c("green", "gold"), cex = .6)
The following script will plot a stacked diagram since the
beside='' attribute was set to false:
barplot(rbind(tax_data$CIT, tax_data$PIT), beside = F, col = c("green", "gold"), names.arg = tax_data$Country, main = "CIT vs PIT collection in 2023 (in Bn USD)")
legend("topleft", legend = c("CIT", "PIT"), fill = c("green", "gold"), cex = .6)
In case you have a long dataset that you need to plot summarized/grouped statistics, you first need to group your data.
# Bar graph data from a dataframe
bar_data = aggregate(cbind(Height, Weight) ~ Class, data = text_dataset, function(x) round(mean(x), 1))
barplot_var = barplot(rbind(bar_data$Height, bar_data$Weight), beside = TRUE, col = c("#abf254", "#23af10"), names.arg = bar_data$Class, main = "Average height and weight for students", xlab = "", ylab = "Height/Weight average")
# Add legend
legend("topright", legend = c("Height", "Weight"), fill = c("#abf254", "#23af10"), cex = .6)
3. Pie chart
A pie chart can be used to present the contribution to the total
pie(tax_data$CIT, labels = tax_data$Country, main = "CIT Distribution", col = rainbow(length(tax_data$CIT)))
4. Scatter plot
A scatter plot is used to show the relationship between 2 continuous variables. In the example below, the height was plotted against weight of the respondent in our imported dataset.
plot(text_dataset$Height, text_dataset$Weight, main = "Height VS Weight of students", xlab = "Height (in cm)", ylab = "Weight (in kg)", col = "blue", pch=4) # pch attribute is used to define the dot type
5. Box plot
The box plot is used to test identify the outlier.
boxplot(text_dataset$Height~text_dataset$Class, main="Box plot for student height", xlab = "", ylab = "Height")
R installation come with some dataset deemed for practice purpose.
Use data() to print a list of the datasets that come with
R. To access a particular dataset (airquality for instance), use
dataframe(airquality)
#data()
airquality_data = data.frame(airquality)
head(airquality_data)
Ozone Solar.R Wind Temp Month Day
1 41 190 7.4 67 5 1
2 36 118 8.0 72 5 2
3 12 149 12.6 74 5 3
4 18 313 11.5 62 5 4
5 NA NA 14.3 56 5 5
6 28 NA 14.9 66 5 6
summary(airquality)
Ozone Solar.R Wind Temp
Min. : 1.00 Min. : 7.0 Min. : 1.700 Min. :56.00
1st Qu.: 18.00 1st Qu.:115.8 1st Qu.: 7.400 1st Qu.:72.00
Median : 31.50 Median :205.0 Median : 9.700 Median :79.00
Mean : 42.13 Mean :185.9 Mean : 9.958 Mean :77.88
3rd Qu.: 63.25 3rd Qu.:258.8 3rd Qu.:11.500 3rd Qu.:85.00
Max. :168.00 Max. :334.0 Max. :20.700 Max. :97.00
NA's :37 NA's :7
Month Day
Min. :5.000 Min. : 1.0
1st Qu.:6.000 1st Qu.: 8.0
Median :7.000 Median :16.0
Mean :6.993 Mean :15.8
3rd Qu.:8.000 3rd Qu.:23.0
Max. :9.000 Max. :31.0
airquality[2:4,4]
[1] 72 74 62
str(airquality) #to get data type
'data.frame': 153 obs. of 6 variables:
$ Ozone : int 41 36 12 18 NA 28 23 19 8 NA ...
$ Solar.R: int 190 118 149 313 NA NA 299 99 19 194 ...
$ Wind : num 7.4 8 12.6 11.5 14.3 14.9 8.6 13.8 20.1 8.6 ...
$ Temp : int 67 72 74 62 56 66 65 59 61 69 ...
$ Month : int 5 5 5 5 5 5 5 5 5 5 ...
$ Day : int 1 2 3 4 5 6 7 8 9 10 ...
summary(pressure)
temperature pressure
Min. : 0 Min. : 0.0002
1st Qu.: 90 1st Qu.: 0.1800
Median :180 Median : 8.8000
Mean :180 Mean :124.3367
3rd Qu.:270 3rd Qu.:126.5000
Max. :360 Max. :806.0000
R comes with sample data which can be accessed using
data().
variable.names(iris) # List all variable names of iris dataframe
[1] "Sepal.Length" "Sepal.Width" "Petal.Length" "Petal.Width" "Species"
# structure(iris) # Get the structure of your data frame
iris[1:10,] #Print first 10 observations
Sepal.Length Sepal.Width Petal.Length Petal.Width Species
1 5.1 3.5 1.4 0.2 setosa
2 4.9 3.0 1.4 0.2 setosa
3 4.7 3.2 1.3 0.2 setosa
4 4.6 3.1 1.5 0.2 setosa
5 5.0 3.6 1.4 0.2 setosa
6 5.4 3.9 1.7 0.4 setosa
7 4.6 3.4 1.4 0.3 setosa
8 5.0 3.4 1.5 0.2 setosa
9 4.4 2.9 1.4 0.2 setosa
10 4.9 3.1 1.5 0.1 setosa
mean(iris$Sepal.Length[iris$Species=="setosa"]) # Get sepal length mean withing setosa specie
[1] 5.006
mean(iris$Petal.Length[iris$Species=="setosa"]) # Get petal length mean withing setosa specie
[1] 1.462
# Get summary statistics by categories
by(iris$Petal.Length, iris$Species, mean)
iris$Species: setosa
[1] 1.462
------------------------------------------------------------
iris$Species: versicolor
[1] 4.26
------------------------------------------------------------
iris$Species: virginica
[1] 5.552
by(iris$Petal.Length, iris$Species, sd)
iris$Species: setosa
[1] 0.173664
------------------------------------------------------------
iris$Species: versicolor
[1] 0.469911
------------------------------------------------------------
iris$Species: virginica
[1] 0.5518947
by(iris$Petal.Length, iris$Species, summary)
iris$Species: setosa
Min. 1st Qu. Median Mean 3rd Qu. Max.
1.000 1.400 1.500 1.462 1.575 1.900
------------------------------------------------------------
iris$Species: versicolor
Min. 1st Qu. Median Mean 3rd Qu. Max.
3.00 4.00 4.35 4.26 4.60 5.10
------------------------------------------------------------
iris$Species: virginica
Min. 1st Qu. Median Mean 3rd Qu. Max.
4.500 5.100 5.550 5.552 5.875 6.900
Box plot of built-in data
After accessing R built-in data, you can do some manipulations. The following script plots the boxplot using R built-in data
boxplot(iris$Sepal.Length~iris$Species, data=iris, ylab="Sepal length", xlab="Species")
1. Working with dates
You can use Sys.Date() which is the system date to get the current
date. To format a value as date, use as.Date. The
difftime() is used to compute the time difference. The
example belows shows how to calculate the number of ellapsed days from 2
date intervals.
today = Sys.Date()
dob = as.Date("2000-01-01")
difftime(today, dob, units = "days")
Time difference of 9354 days
2. Creating a new variable
The following script created a new variable named salary_to_age,
which is the ratio of the employee salary divided by his/her age. The
function round is used to round the new created variable to one decimal
see: digit=1
attach(employees_data)
The following objects are masked _by_ .GlobalEnv:
employee_sex, employees_age, employees_names, employees_salary
(salary_to_age = round((employees_salary/employees_age),digit=1))
[1] 521.7 546.5 1684.0 1112.9 1441.7
head(cbind(employees_data, salary_to_age))
employees_names employees_age employees_salary employee_sex salary_to_age
1 Joe 23 12000 Male 521.7
2 Doe 43 23500 Male 546.5
3 Claire 25 42100 Female 1684.0
4 Peter 31 34500 Male 1112.9
5 Tom 48 69200 Female 1441.7
Let’s create a new variable “BMI (weight to height ratio)” based on student height and weight:
BMI = round(text_dataset$Weight/(text_dataset$Height/100), digits = 1) #Height should be in meters
text_dataset = data.frame(text_dataset, BMI)
head(text_dataset)
Name Age Height Weight Class BMI
1 Lillie Galindo 52 148 87 Level 3 58.8
2 Salvatore Warner 17 151 91 Level 4 60.3
3 Wynter Novak 42 154 53 Level 3 34.4
4 Bishop Ramos 68 163 48 Level 3 29.4
5 Alice Cantu 39 142 90 Level 3 63.4
6 Anakin Gomez 26 147 67 Level 4 45.6
tail(text_dataset)
Name Age Height Weight Class BMI
60 Rogelio Wolf 59 150 66 Level 1 44.0
61 Jolene Meyer 33 167 61 Level 1 36.5
62 Tristan Chambers 63 151 89 Level 3 58.9
63 Makayla Spence 54 166 63 Level 3 38.0
64 Cillian Day 55 164 72 Level 2 43.9
65 Hayden Bryan 72 155 62 Level 3 40.0
3. Merging data
merge.data() and specify the common
ID.paye_data = read.table(text = "Country PAYE
Rwanda 23.3
Tanzania 26.2
Burundi 19.7
Kenya 24.5", header = T)
tax_data_merged = merge.data.frame(tax_data, paye_data, by="Country")
tax_data_merged
Country CIT PIT PAYE
1 Burundi 10.9 2.9 19.7
2 Kenya 15.8 5.2 24.5
3 Rwanda 12.3 2.5 23.3
4 Tanzania 14.2 4.1 26.2
Merging using cbind() function
In order to apply the cbind() command we first nee to
sorb the key (common) identifier.
tax_data_ordered = tax_data[order(tax_data$Country),]
paye_data_odered = paye_data[order(paye_data$Country),]
PAYE = paye_data_odered$PAYE
tax_data_merged_2 = cbind(tax_data_ordered, PAYE)
tax_data_merged_2
Country CIT PIT PAYE
3 Burundi 10.9 2.9 19.7
4 Kenya 15.8 5.2 24.5
1 Rwanda 12.3 2.5 23.3
2 Tanzania 14.2 4.1 26.2
4. Appending the datasets
In order to append the datasets, use rbind(). Note that
the datasets must have exactly the same column names. In the following
script, we created a new dataframe for Uganda and append it to the
existing dataframe.
additional_tax_data = read.table(text = "Country PIT PAYE CIT
Uganda 2.8 21.6 11.9", header = T)
additional_tax_data
Country PIT PAYE CIT
1 Uganda 2.8 21.6 11.9
tax_data_expanded = rbind(tax_data_merged, additional_tax_data)
tax_data_expanded
Country CIT PIT PAYE
1 Burundi 10.9 2.9 19.7
2 Kenya 15.8 5.2 24.5
3 Rwanda 12.3 2.5 23.3
4 Tanzania 14.2 4.1 26.2
5 Uganda 11.9 2.8 21.6
This is used when you want to select a part of the dataset for a given purpose.
subset_1 = tax_data_expanded[3,] #Selecting case number 3 (3rd row) with all variables
subset_2 = tax_data_expanded[2:4,] #Selecting from 2nd to 4th cases
subset_3 = tax_data_expanded[-3,] #Removing only the third and keep others
subset_4 = tax_data_expanded[,"Country"] #Keeping only the first column (variable). You can also use tax_data_expanded[,0]
subset_5 = tax_data_expanded[,1:3] #Keeping only the first 3 columns. You can also use tax_data_expanded[,c("Country", "CIT", "PIT")]
subset_6 = tax_data_expanded[,-3] #Removing only the third column and keep others
subset_7 = tax_data_expanded[c(3,4),1:3] #This will keep 3rd and 4th rows (Rwanda and Tanzania) and only 3 columns (Country name, CIT and PIT)
subset_7
Country CIT PIT
3 Rwanda 12.3 2.5
4 Tanzania 14.2 4.1
Method 1 - subset analysis
With this method, we only consider complete cases
complete.cases()
function. This function will drop all cases with missing ageAge_yrs = c(23,21,26,32,35,NA,34,32,40,NA,35,NA,23,23,34,NA,37,32,33,19,NA,20,23,NA,26,43,26,29,21,32,36,NA,NA,34,37,23,23,23,NA,27,32,33,32,36,NA,35,38,34,42,36,27,31,37,32,NA,28,26,NA,43,27,46,32,NA,NA,30)
text_dataset <- text_dataset[, !grepl("Age", names(text_dataset))] # First drop the column names containing Age
text_dataset = data.frame(text_dataset, Age_yrs)
text_dataset_dropna = text_dataset[complete.cases(text_dataset$Age_yrs), ]
nrow(text_dataset_dropna)
[1] 51
Method 2 - imputation (replacing missing values with mean or median)
With this method, the missing value are replaced by mean or median of valid numeric values
# impute NA with mean
text_dataset_imputena = text_dataset
text_dataset_imputena$Age_yrs = ifelse(is.na(text_dataset_imputena$Age_yrs), round(mean(text_dataset_imputena$Age_yrs, na.rm = TRUE), digits = 0), text_dataset_imputena$Age_yrs)
nrow(text_dataset_imputena)
[1] 65
# impute NA with median
text_dataset_imputena = text_dataset
text_dataset_imputena$Age_yrs = ifelse(is.na(text_dataset_imputena$Age_yrs), round(median(text_dataset_imputena$Age_yrs, na.rm = TRUE), digits = 0), text_dataset_imputena$Age_yrs)
nrow(text_dataset_imputena)
[1] 65
text_dataset_imputena = text_dataset
text_dataset_imputena$Age_yrs = ifelse(is.na(text_dataset_imputena$Age_yrs), mode(text_dataset_imputena$Age_yrs), text_dataset_imputena$Age_yrs)
nrow(text_dataset_imputena)
[1] 65
1. Summary stats (mean, median, mode, standard deviation)
#Mean
mean(text_dataset$Height)
[1] 155.1231
#Median
median(text_dataset$Height)
[1] 155
#Quartile
quantile(text_dataset$Height)
0% 25% 50% 75% 100%
136 147 155 165 174
#All summary stats
summary(text_dataset$Height)
Min. 1st Qu. Median Mean 3rd Qu. Max.
136.0 147.0 155.0 155.1 165.0 174.0
2. Cross tabulation
The following script extract a dataframe called UCBAdmission and produce summary statistics, then the contigency table (2 way table) for admission status VS gender. Furthermore, the percentages were computed.
admissions = data.frame(UCBAdmissions)
variable.names(admissions)
[1] "Admit" "Gender" "Dept" "Freq"
summary(admissions)
Admit Gender Dept Freq
Admitted:12 Male :12 A:4 Min. : 8.0
Rejected:12 Female:12 B:4 1st Qu.: 80.0
C:4 Median :170.0
D:4 Mean :188.6
E:4 3rd Qu.:302.5
F:4 Max. :512.0
crosstab1 = xtabs(Freq~Gender+Admit, data = admissions) # Crosstab for frequencies
prop.table(crosstab1,1) # Percentage (1 is used to get row %, while 2 is used for col percent)
Admit
Gender Admitted Rejected
Male 0.4451877 0.5548123
Female 0.3035422 0.6964578
3. Chi-square test for independence
Chi-square is the test for independence between 2 categorical variables. The null hypothesis for chi-square test assumes that each of the 2 categorical variable is independent from another. In our example, we are going to test the independency test between admission status and gender of the applicant. Remember the null hypothesis for Chi-square test assumes that each of the two variables is independent from the other (no relationship between the 2)
chisq.test(admissions$Admit, admissions$Gender)
Pearson's Chi-squared test
data: admissions$Admit and admissions$Gender
X-squared = 0, df = 1, p-value = 1
From our test results, we reject the null hypothesis, hence conclude that there is a statistical relationship between the 2.
4. Linear model and regression analysis
air_data = data.frame(airquality)
plot(air_data$Wind, air_data$Temp)
cor(air_data$Wind, air_data$Temp)
[1] -0.4579879
cor.test(air_data$Wind, air_data$Temp)
Pearson's product-moment correlation
data: air_data$Wind and air_data$Temp
t = -6.3308, df = 151, p-value = 2.642e-09
alternative hypothesis: true correlation is not equal to 0
95 percent confidence interval:
-0.5748874 -0.3227660
sample estimates:
cor
-0.4579879
plot(air_data$Wind, air_data$Temp)
linear_model = lm(Temp~Wind, data = air_data)
abline((linear_model)$coefficient)
lines(lowess(air_data$Wind, air_data$Temp), col="blue")
summary(linear_model)
Call:
lm(formula = Temp ~ Wind, data = air_data)
Residuals:
Min 1Q Median 3Q Max
-23.291 -5.723 1.709 6.016 19.199
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 90.1349 2.0522 43.921 < 2e-16 ***
Wind -1.2305 0.1944 -6.331 2.64e-09 ***
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Residual standard error: 8.442 on 151 degrees of freedom
Multiple R-squared: 0.2098, Adjusted R-squared: 0.2045
F-statistic: 40.08 on 1 and 151 DF, p-value: 2.642e-09
The dplyr function is widely used in data management. The
#Start by installing the packages if not installed
#install.packages("dplyr")
#install.packages("nycflights13")
#install.packages("tidyverse)
library(dplyr)
Attaching package: 'dplyr'
The following objects are masked from 'package:stats':
filter, lag
The following objects are masked from 'package:base':
intersect, setdiff, setequal, union
library(nycflights13)
library(tidyverse)
── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
✔ forcats 1.0.0 ✔ readr 2.1.5
✔ ggplot2 3.5.2 ✔ stringr 1.5.1
✔ lubridate 1.9.4 ✔ tibble 3.3.0
✔ purrr 1.1.0 ✔ tidyr 1.3.1
── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag() masks stats::lag()
ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
flights_data = nycflights13::flights
#View(flights_data)
Filter The filter function is shown below:
#Filtering data
jan_flights = filter(flights_data, month==1) #interested in flights that took in Jan
dec_flights = filter(flights_data, month==12)
apr_flights = filter(flights_data, month==4)
ny_flights = filter(flights_data, month==1, day==1)
xmas_flights = filter(flights_data, month==12, day==25)
dec_jan_flights = filter(flights_data, month==12 | month==1)
jan_june_flights = filter(flights_data, month %in% c(1,2,3,4,5,6))
nrow(jan_flights)
[1] 27004
nrow(dec_flights)
[1] 28135
nrow(apr_flights)
[1] 28330
str(dec_jan_flights)
tibble [55,139 × 19] (S3: tbl_df/tbl/data.frame)
$ year : int [1:55139] 2013 2013 2013 2013 2013 2013 2013 2013 2013 2013 ...
$ month : int [1:55139] 1 1 1 1 1 1 1 1 1 1 ...
$ day : int [1:55139] 1 1 1 1 1 1 1 1 1 1 ...
$ dep_time : int [1:55139] 517 533 542 544 554 554 555 557 557 558 ...
$ sched_dep_time: int [1:55139] 515 529 540 545 600 558 600 600 600 600 ...
$ dep_delay : num [1:55139] 2 4 2 -1 -6 -4 -5 -3 -3 -2 ...
$ arr_time : int [1:55139] 830 850 923 1004 812 740 913 709 838 753 ...
$ sched_arr_time: int [1:55139] 819 830 850 1022 837 728 854 723 846 745 ...
$ arr_delay : num [1:55139] 11 20 33 -18 -25 12 19 -14 -8 8 ...
$ carrier : chr [1:55139] "UA" "UA" "AA" "B6" ...
$ flight : int [1:55139] 1545 1714 1141 725 461 1696 507 5708 79 301 ...
$ tailnum : chr [1:55139] "N14228" "N24211" "N619AA" "N804JB" ...
$ origin : chr [1:55139] "EWR" "LGA" "JFK" "JFK" ...
$ dest : chr [1:55139] "IAH" "IAH" "MIA" "BQN" ...
$ air_time : num [1:55139] 227 227 160 183 116 150 158 53 140 138 ...
$ distance : num [1:55139] 1400 1416 1089 1576 762 ...
$ hour : num [1:55139] 5 5 5 5 6 5 6 6 6 6 ...
$ minute : num [1:55139] 15 29 40 45 0 58 0 0 0 0 ...
$ time_hour : POSIXct[1:55139], format: "2013-01-01 05:00:00" "2013-01-01 05:00:00" ...
Arrange function
help(arrange)
arrange(flights_data, carrier, year, month, day) #used to sort data based one one or multiple columns
# A tibble: 336,776 × 19
year month day dep_time sched_dep_time dep_delay arr_time sched_arr_time
<int> <int> <int> <int> <int> <dbl> <int> <int>
1 2013 1 1 810 810 0 1048 1037
2 2013 1 1 1451 1500 -9 1634 1636
3 2013 1 1 1452 1455 -3 1637 1639
4 2013 1 1 1454 1500 -6 1635 1636
5 2013 1 1 1507 1515 -8 1651 1656
6 2013 1 1 1530 1530 0 1650 1655
7 2013 1 1 1546 1540 6 1753 1748
8 2013 1 1 1550 1550 0 1844 1831
9 2013 1 1 1552 1600 -8 1749 1757
10 2013 1 1 1554 1600 -6 1701 1734
# ℹ 336,766 more rows
# ℹ 11 more variables: arr_delay <dbl>, carrier <chr>, flight <int>,
# tailnum <chr>, origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>,
# hour <dbl>, minute <dbl>, time_hour <dttm>
arrange(flights_data, desc(month), desc(day)) #descending ordering
# A tibble: 336,776 × 19
year month day dep_time sched_dep_time dep_delay arr_time sched_arr_time
<int> <int> <int> <int> <int> <dbl> <int> <int>
1 2013 12 31 13 2359 14 439 437
2 2013 12 31 18 2359 19 449 444
3 2013 12 31 26 2245 101 129 2353
4 2013 12 31 459 500 -1 655 651
5 2013 12 31 514 515 -1 814 812
6 2013 12 31 549 551 -2 925 900
7 2013 12 31 550 600 -10 725 745
8 2013 12 31 552 600 -8 811 826
9 2013 12 31 553 600 -7 741 754
10 2013 12 31 554 550 4 1024 1027
# ℹ 336,766 more rows
# ℹ 11 more variables: arr_delay <dbl>, carrier <chr>, flight <int>,
# tailnum <chr>, origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>,
# hour <dbl>, minute <dbl>, time_hour <dttm>
Select function (used when interested in specific variables)
selected_flights = select(flights_data, year, month, day) #only keep three variables (year, month, and day)
selected_flights = select(flights_data, (year:arr_time)) #keeping all columns between year and arr_time
selected_flights = select(flights_data, -(year:arr_time)) #dropping all variables from column year to arr_time
selected_flights
# A tibble: 336,776 × 12
sched_arr_time arr_delay carrier flight tailnum origin dest air_time
<int> <dbl> <chr> <int> <chr> <chr> <chr> <dbl>
1 819 11 UA 1545 N14228 EWR IAH 227
2 830 20 UA 1714 N24211 LGA IAH 227
3 850 33 AA 1141 N619AA JFK MIA 160
4 1022 -18 B6 725 N804JB JFK BQN 183
5 837 -25 DL 461 N668DN LGA ATL 116
6 728 12 UA 1696 N39463 EWR ORD 150
7 854 19 B6 507 N516JB EWR FLL 158
8 723 -14 EV 5708 N829AS LGA IAD 53
9 846 -8 B6 79 N593JB JFK MCO 140
10 745 8 AA 301 N3ALAA LGA ORD 138
# ℹ 336,766 more rows
# ℹ 4 more variables: distance <dbl>, hour <dbl>, minute <dbl>,
# time_hour <dttm>
Mutate function (used to create a new variable)
flights_data_2 = select(flights, year:day, ends_with("delay"), distance, ends_with("time"))
flights_data_3 = mutate(flights_data_2, gain=arr_delay-dep_delay, speed=round(distance/air_time*60,digits = 1))
flights_data_3
# A tibble: 336,776 × 13
year month day dep_delay arr_delay distance dep_time sched_dep_time
<int> <int> <int> <dbl> <dbl> <dbl> <int> <int>
1 2013 1 1 2 11 1400 517 515
2 2013 1 1 4 20 1416 533 529
3 2013 1 1 2 33 1089 542 540
4 2013 1 1 -1 -18 1576 544 545
5 2013 1 1 -6 -25 762 554 600
6 2013 1 1 -4 12 719 554 558
7 2013 1 1 -5 19 1065 555 600
8 2013 1 1 -3 -14 229 557 600
9 2013 1 1 -3 -8 944 557 600
10 2013 1 1 -2 8 733 558 600
# ℹ 336,766 more rows
# ℹ 5 more variables: arr_time <int>, sched_arr_time <int>, air_time <dbl>,
# gain <dbl>, speed <dbl>
Rename function (used for renaming variables)
renamed_flights_data = rename(flights_data_3, scheduled_dep_time = sched_dep_time, scheduled_arr_time = sched_arr_time)
names(renamed_flights_data)
[1] "year" "month" "day"
[4] "dep_delay" "arr_delay" "distance"
[7] "dep_time" "scheduled_dep_time" "arr_time"
[10] "scheduled_arr_time" "air_time" "gain"
[13] "speed"
group_by() and %>%
Used to display outputs by groups. “%>%” allows us to chain multiple operations together, improving code readability.
test_data = mtcars
group_by(test_data, cyl)
# A tibble: 32 × 11
# Groups: cyl [3]
mpg cyl disp hp drat wt qsec vs am gear carb
<dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 21 6 160 110 3.9 2.62 16.5 0 1 4 4
2 21 6 160 110 3.9 2.88 17.0 0 1 4 4
3 22.8 4 108 93 3.85 2.32 18.6 1 1 4 1
4 21.4 6 258 110 3.08 3.22 19.4 1 0 3 1
5 18.7 8 360 175 3.15 3.44 17.0 0 0 3 2
6 18.1 6 225 105 2.76 3.46 20.2 1 0 3 1
7 14.3 8 360 245 3.21 3.57 15.8 0 0 3 4
8 24.4 4 147. 62 3.69 3.19 20 1 0 4 2
9 22.8 4 141. 95 3.92 3.15 22.9 1 0 4 2
10 19.2 6 168. 123 3.92 3.44 18.3 1 0 4 4
# ℹ 22 more rows
by_cyl <- mtcars %>% group_by(cyl)
#using %>%
test_data %>%
group_by(cyl) %>%
summarise(mean_weight = mean(wt)) %>%
arrange(cyl)
# A tibble: 3 × 2
cyl mean_weight
<dbl> <dbl>
1 4 2.29
2 6 3.12
3 8 4.00
#install package if not installed
#install.packages("ggplot2")
library(ggplot2)
cars_data = data.frame(mtcars)
table(cars_data$cyl)
4 6 8
11 7 14
#Bar plot
barplot(table(cars_data$cyl), main = "Car cylinders", col="orange", border="orange", ylab="Frequency", xlab="Number of cylinders")
#Dot plot
dotchart(cars_data$mpg, pch=3, labels = row.names(cars_data), cex=.5, main="Miles per gallon of the car model", xlab="MPG")
#Scatter plot
plot(cars_data$mpg~cars_data$wt, main="Automobile data", xlab="Weight", ylab="Miles per gallon", col="blue", pch=10)
ggplot(data=cars_data, aes(x=wt,y=mpg))+geom_point(size=1, col="blue")+labs(title = "Automobile data", x="Weight", y="Miles per gallon")+geom_smooth(method = "lm", color="orange")
`geom_smooth()` using formula = 'y ~ x'
#Comparision chart
cars_data$am = factor(cars_data$am, levels = c(0,1), labels = c("Automatic", "Manual"))
cars_data$vs = factor(cars_data$vs, levels = c(0,1), labels = c("V-Engine", "Straight Engine"))
cars_data$cyl = factor(cars_data$cyl)
ggplot(data=cars_data, aes(x=hp,y=mpg, shape=cyl, color=cyl))+geom_point(size=2)+facet_grid(am~vs)+labs(title = "Automobile data by engine type", x="HorsePower", y="Miles per gallon")
basic_gg_plot = ggplot(data=cars_data, aes(x=wt,y=mpg))+geom_point(size=1, col="blue")+labs(title = "Automobile data", x="Weight", y="Miles per gallon")
geom_hline() adds a horizontal line. The plot below
presents an average milles per gallow
ggplot(data=cars_data, aes(x=wt,y=mpg))+geom_point(size=1, col="blue")+labs(title = "Automobile data", x="Weight", y="Miles per gallon")+geom_hline(yintercept = mean(cars_data$mpg), linetype = "dashed", color = "red") + geom_text(aes(x = 5.2, y = 21.4, label = "Average MPG"), color = "red")
Warning in geom_text(aes(x = 5.2, y = 21.4, label = "Average MPG"), color = "red"): All aesthetics have length 1, but the data has 32 rows.
ℹ Please consider using `annotate()` or provide this layer with data containing
a single row.
The geom_jitter() function is used to create a jittered scatter plot. It adds a small amount of random noise to the data points along the x-axis or y-axis, helping to visualize the distribution of points when they overlap.
ggplot(data=cars_data, aes(x=wt,y=mpg))+geom_point(size=1, col="blue")+labs(title = "Automobile data", x="Weight", y="Miles per gallon")+geom_jitter(width = 0.2, height = 0.2, color = "grey") +
theme_minimal()
ggplot(data=cars_data, aes(x=wt,y=mpg))+geom_point(size=1, col="blue")+labs(title = "Automobile data", x="Weight", y="Miles per gallon")+geom_jitter(width = 0.2, height = 0.2, color = "grey") +
theme_minimal()
help(geom_rug)
The geom_line() function is used to add lines to a plot.
It connects the data points in the order they appear in the data frame,
creating a line chart.
ggplot(data=cars_data, aes(x=wt,y=mpg))+geom_point(size=2, col="blue")+labs(title = "Automobile data", x="Weight", y="Miles per gallon")+geom_line(color = "blue")
The geom_rug() function is used to add marginal rugs to
a plot. Rugs are small lines or ticks that extend along the axes,
indicating the distribution of data points along a particular axis.
geom_rug() can be used with either the x-axis or y-axis, or both.
ggplot(data=cars_data, aes(x=wt,y=mpg))+geom_point(size=2, col="blue")+labs(title = "Automobile data", x="Weight", y="Miles per gallon")+geom_rug(sides = "bl")
Functions in R
f = function(x) x^2
formals(f)
$x
body(f)
x^2
f(5)
[1] 25
p_values = c(0.03,0.034,0.23,0.001,0.08)
significance = ifelse(p_values < 0.05, "Significant", "Not significant")
significance
[1] "Significant" "Significant" "Not significant" "Significant"
[5] "Not significant"
Generating random numbers following a given distribution
runif(10,0,20) #this function will generate 10 observations within the interval [0,20], and follows uniform distribution
[1] 7.868876 8.083936 13.817482 4.523355 19.844741 3.227845 5.281990
[8] 6.744382 3.828884 18.001967
rpois(10, 12) #this will generate 10 values following a poison distribution with lambda=12
[1] 11 10 11 17 9 12 11 17 8 14
ran_data = runif(10,0,20)
apply() and lapply functions
apply(data, margin, function) Data stands for your dataset, margin (1 stands for rows, 2 stands for columns), function: eg mean, sum,…
lapply() function is used when you have a custom function
Titanic data
str(Titanic)
'table' num [1:4, 1:2, 1:2, 1:2] 0 0 35 0 0 0 17 0 118 154 ...
- attr(*, "dimnames")=List of 4
..$ Class : chr [1:4] "1st" "2nd" "3rd" "Crew"
..$ Sex : chr [1:2] "Male" "Female"
..$ Age : chr [1:2] "Child" "Adult"
..$ Survived: chr [1:2] "No" "Yes"
apply(Titanic, c(3,2,4), sum)
, , Survived = No
Sex
Age Male Female
Child 35 17
Adult 1329 109
, , Survived = Yes
Sex
Age Male Female
Child 29 28
Adult 338 316
summary(Titanic)
Number of cases in table: 2201
Number of factors: 4
Test for independence of all factors:
Chisq = 1637.4, df = 25, p-value = 0
Chi-squared approximation may be incorrect
numbers = list(a = 1, b = 2, c = 3)
sapply(numbers, function(x) x * 2)
a b c
2 4 6
#The vapply function is similar to sapply, but it allows you to specify the type of the result. This can be useful when you want to ensure the result has a specific structure.
vapply(numbers, function(x) x * 2, numeric(1))
a b c
2 4 6
T-test is used to perform one and two sample t-tests on vectors of data.
t.test(mpg~vs, data=mtcars)
Welch Two Sample t-test
data: mpg by vs
t = -4.6671, df = 22.716, p-value = 0.0001098
alternative hypothesis: true difference in means between group 0 and group 1 is not equal to 0
95 percent confidence interval:
-11.462508 -4.418445
sample estimates:
mean in group 0 mean in group 1
16.61667 24.55714
str(women)
'data.frame': 15 obs. of 2 variables:
$ height: num 58 59 60 61 62 63 64 65 66 67 ...
$ weight: num 115 117 120 123 126 129 132 135 139 142 ...
fit = lm(weight~height, data=women)
plot(resid(fit))
fit
Call:
lm(formula = weight ~ height, data = women)
Coefficients:
(Intercept) height
-87.52 3.45
abline(fit$coefficient)
round(fitted(fit), digits = 0)
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
113 116 119 123 126 130 133 137 140 144 147 151 154 157 161
summary(fit)
Call:
lm(formula = weight ~ height, data = women)
Residuals:
Min 1Q Median 3Q Max
-1.7333 -1.1333 -0.3833 0.7417 3.1167
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) -87.51667 5.93694 -14.74 1.71e-09 ***
height 3.45000 0.09114 37.85 1.09e-14 ***
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Residual standard error: 1.525 on 13 degrees of freedom
Multiple R-squared: 0.991, Adjusted R-squared: 0.9903
F-statistic: 1433 on 1 and 13 DF, p-value: 1.091e-14
fit2 = lm(weight ~ poly(height, degree = 2, raw = TRUE), data = women)
summary(fit2)
Call:
lm(formula = weight ~ poly(height, degree = 2, raw = TRUE), data = women)
Residuals:
Min 1Q Median 3Q Max
-0.50941 -0.29611 -0.00941 0.28615 0.59706
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 261.87818 25.19677 10.393 2.36e-07 ***
poly(height, degree = 2, raw = TRUE)1 -7.34832 0.77769 -9.449 6.58e-07 ***
poly(height, degree = 2, raw = TRUE)2 0.08306 0.00598 13.891 9.32e-09 ***
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Residual standard error: 0.3841 on 12 degrees of freedom
Multiple R-squared: 0.9995, Adjusted R-squared: 0.9994
F-statistic: 1.139e+04 on 2 and 12 DF, p-value: < 2.2e-16
par(mfrow = c(2,2))
plot(fit)
plot(fit2)
fit3 = lm(weight~height+I(height^2), data=women)
par(mfrow = c(2,2))
plot(fit3)
#Multiple regression
air_reg = lm(Temp~Wind+Ozone, data=airquality)
summary(air_reg)
Call:
lm(formula = Temp ~ Wind + Ozone, data = airquality)
Residuals:
Min 1Q Median 3Q Max
-21.487 -5.137 1.220 4.674 13.102
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 74.18035 2.96526 25.017 < 2e-16 ***
Wind -0.37829 0.22080 -1.713 0.0894 .
Ozone 0.17615 0.02393 7.362 3.15e-11 ***
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Residual standard error: 6.762 on 113 degrees of freedom
(37 observations deleted due to missingness)
Multiple R-squared: 0.5007, Adjusted R-squared: 0.4918
F-statistic: 56.65 on 2 and 113 DF, p-value: < 2.2e-16
library(MASS)
Attaching package: 'MASS'
The following object is masked from 'package:dplyr':
select
data("UScrime")
head(UScrime)
M So Ed Po1 Po2 LF M.F Pop NW U1 U2 GDP Ineq Prob Time y
1 151 1 91 58 56 510 950 33 301 108 41 394 261 0.084602 26.2011 791
2 143 0 113 103 95 583 1012 13 102 96 36 557 194 0.029599 25.2999 1635
3 142 1 89 45 44 533 969 18 219 94 33 318 250 0.083401 24.3006 578
4 136 0 121 149 141 577 994 157 80 102 39 673 167 0.015801 29.9012 1969
5 141 0 121 109 101 591 985 18 30 91 20 578 174 0.041399 21.2998 1234
6 121 0 110 118 115 547 964 25 44 84 29 689 126 0.034201 20.9995 682
## independent t test
t.test(Prob~So, data=UScrime)
Welch Two Sample t-test
data: Prob by So
t = -3.8954, df = 24.925, p-value = 0.0006506
alternative hypothesis: true difference in means between group 0 and group 1 is not equal to 0
95 percent confidence interval:
-0.03852569 -0.01187439
sample estimates:
mean in group 0 mean in group 1
0.03851265 0.06371269
## Paired test
# t.test(y1,y2,paired=TRUE)
sapply(UScrime[c("U1","U2")],function(x)(c(mean=mean(x),sd=sd(x))))
U1 U2
mean 95.46809 33.97872
sd 18.02878 8.44545
with(UScrime,t.test(U1,U2,paired=TRUE))
Paired t-test
data: U1 and U2
t = 32.407, df = 46, p-value < 2.2e-16
alternative hypothesis: true mean difference is not equal to 0
95 percent confidence interval:
57.67003 65.30870
sample estimates:
mean difference
61.48936
#Wilcoxon
wilcox.test(Prob~So, data=UScrime)
Wilcoxon rank sum exact test
data: Prob by So
W = 81, p-value = 8.488e-05
alternative hypothesis: true location shift is not equal to 0
#Kruskal wallis
states<-data.frame(state.region,state.x77)
kruskal.test(Illiteracy~state.region,data=states)
Kruskal-Wallis rank sum test
data: Illiteracy by state.region
Kruskal-Wallis chi-squared = 22.672, df = 3, p-value = 4.726e-05
Generating random numbers
x <- rnorm(10)
x <- rnorm(10, 20, 2)
set.seed(20)
x <- rnorm(100)
e <- rnorm(100, 0, 2)
y <- 0.5 + 2 * x + e
plot(x, y)
country<-c("Australia", "Austria", "Belgium","Canada", "Denmark", "Finland","France","Germany",
"Greece","Ireland","Italy","Japan", "Netherland",
"New Zealand", "Norway", "Portugal", "Spain","Sweden", "Switzerland","UK","USA")
Income.inequality<- c(7.0,4.8,4.6,5.6,4.3,3.7,5.6,5.2,6.2,6.0,6.7,3.4,5.3,6.8,3.9,8.0,5.5,4.0,5.7 ,7.2,8.6)
Index.HS<-c(0.07,0.01,-0.23,-0.07,-0.19,-0.43,0.05,-0.06,0.38,0.25,-0.12,- 1.26,-0.51,0.29,-0.63
,1.18,-0.30,-0.83,-0.46,0.79,2.02)
GDP_WB<- c(45926,47682,43435,45066,45537,30676,39328,46401,26851,49393,35463,36319,48253,37679,65615
,28760,33629,45297,59540,40233,54630)
data.21<-data.frame(country,Income.inequality,Index.HS,GDP_WB)
plot(data.21[c("Income.inequality","Index.HS")])
Index_inequality.df<-data.21[c("Income.inequality","Index.HS")]
str(Index_inequality.df)
'data.frame': 21 obs. of 2 variables:
$ Income.inequality: num 7 4.8 4.6 5.6 4.3 3.7 5.6 5.2 6.2 6 ...
$ Index.HS : num 0.07 0.01 -0.23 -0.07 -0.19 -0.43 0.05 -0.06 0.38 0.25 ...
(country<-data.21[,"country"])
[1] "Australia" "Austria" "Belgium" "Canada" "Denmark"
[6] "Finland" "France" "Germany" "Greece" "Ireland"
[11] "Italy" "Japan" "Netherland" "New Zealand" "Norway"
[16] "Portugal" "Spain" "Sweden" "Switzerland" "UK"
[21] "USA"
(country.2<-data.21["country"])
country
1 Australia
2 Austria
3 Belgium
4 Canada
5 Denmark
6 Finland
7 France
8 Germany
9 Greece
10 Ireland
11 Italy
12 Japan
13 Netherland
14 New Zealand
15 Norway
16 Portugal
17 Spain
18 Sweden
19 Switzerland
20 UK
21 USA
str(country)
chr [1:21] "Australia" "Austria" "Belgium" "Canada" "Denmark" "Finland" ...
str(country.2)
'data.frame': 21 obs. of 1 variable:
$ country: chr "Australia" "Austria" "Belgium" "Canada" ...
plot(Index_inequality.df,pch=20)
text(Index_inequality.df,labels=country, cex=.75, pos=1)
#Handling overlapping texts
which(country %in% c("Austria","Denmark","Germany","Netherland"))
[1] 2 5 8 13
text.left<-which(country %in% c("Austria","Denmark","Germany","Netherland"))
text.left
[1] 2 5 8 13
text.right<-setdiff(1:nrow(data.21),text.left)
text.right
[1] 1 3 4 6 7 9 10 11 12 14 15 16 17 18 19 20 21
pos.text<-ifelse(1:nrow(data.21)%in% text.left,2,4)
plot(Index_inequality.df,pch=20,col="orange",xlim=c(3,9),ylim=c(-1.5,2.5))
text(Index_inequality.df,labels=country,pos=pos.text,cex=0.7)
text.up<-which(country %in% "Germany")
text.up
[1] 8
text.left<-setdiff(1:nrow(data.21),c(text.right,text.up))
text.left
[1] 2 5 13
pos.text<-ifelse(1:nrow(data.21) %in% text.up,3,ifelse(1:nrow(data.21)%in% text.left,2,4))
plot(Index_inequality.df,pch=20,col="orange",xlim=c(3,9),ylim=c(- 1.5,2.5),ann=FALSE)
text(Index_inequality.df,labels = country,pos=pos.text,cex=0.7)
main.title<-"Income inequality vs Index of Health and Social problems"
x.lab<-"Income inequality (5th Ratio)"
y.lab<-"Index of Health and Social problems"
title(main=main.title,xlab=x.lab,ylab=y.lab)
title(main=main.title,xlab=x.lab,ylab=y.lab)
mtext(c("Better","Worse"),side=2,at=c(-1.8,2.8),las=1)
text(x=5,y=1.5,labels=paste("r=",round(cor(Index_inequality.df[1],Index_inequality.df[2]),digits=2)))