Things to know about R:
Things to know about RStudio:
CRAN, the official R repository.
CRAN is a network of servers (called mirrors) around the world. ggplot2: install.packages("package_name1"). install.packages(c("package_name1", "package_name2")). To install R packages from RStudio, follow these steps:
Tools from the menu bar and then click on Install Packages…:Packages text box and click on the Install button:There is also a list of common problems when installing packages available on the RStudio support page at \(\Rightarrow\) Click me.
R can be used to do basic math.right angle bracket (>) where code should be entered.2 + 3
3 * 5 * 7
5/2
10/5
4 + 6 * 5
2 * (3 + 5)
<- and =, with the first being preferred.x <- 2
x
2 -> x
x
x = 2
x
u <- v <- 7
u
v
rm() function.y <- 15
y
rm(y)
y # object 'y' not found
NumericCharacter (String)Date (time-based)logical (TRUE/FALSE)class().is.numeric().is.integer().x <- 25
class(x)
is.numeric(x)
x <- 25L
class(x)
is.integer(x)
is.numeric(x)
x <- "Missouri"
x
class(x)
nchar(x)
y <- factor(x)
y
class(y)
nchar(y) # Error in nchar(y) : 'nchar()' requires a character vector
TRUE or FALSE.TRUE is the same as 1 and FALSE is the same as 0.TRUE
T
FALSE
F
10 * TRUE
10 * FALSE
x <- TRUE
class(x)
is.logical(x)
# does 7 equal 10?
7 == 10
# does 10 not equal 7?
10 != 7
# is 7 less than 10?
7 < 10
# is 7 less than or equal to 10?
7 <= 10
# is 7 greater than 10?
7 > 10
# is 7 greater than or equal to 10?
7 >= 10
c(2, 1, 5, 10, -9) is a vector of numbers.c("high", "medium", "low", "unknown") is a vector of characters.c. Thec means combine.dat <- c(2.24, 2.05, 1.76, 2.43, 1.75, 1.54, 1.84, 1.94, 1.64, 1.50)
dat
dat[2]
dat[1:3]
dat[-(1:3)]
dat[c(1,5,8)]
dat[-c(1,5,8)]
length(dat)
x <- 1:10
length(x)
dat - 0.5
dat + 3.2
dat/3
dat^2
sqrt(dat)
length(sqrt(dat))
dat == x
dat < x
dat >= x
dat != x
as.factor() to convert a character vector to a factor vector.fac2, R also prints the levels.
as.numeric().fac1 <- c("Hockey", "Football", "Baseball", "Curling", "Rugby",
"Lacrosse", "Basketball", "Tennis", "Cricket", "Soccer")
fac2 <- as.factor(fac1)
fac2
as.numeric(fac2)
# Relevel
fac3 <- relevel(fac2, ref = "Soccer")
fac3
mean(dat)
median(dat)
?mean.is.na tests each element of a vector for missingness.x <- c(2, 3, 5, 7, NA, 5, NA, NA)
x
mean(x)
mean(x, na.rm = TRUE)
is.na(x)
z <- c("Male", NA, "Female")
z
is.na(z)
mi, mice and Amelia packages.magrittr package functions by taking the value (object) on the left-hand side of the pipe and inserting it into the first argument of the function that is on the right-hand side of the pipe.
library(magrittr)
z <- c(2, 5, 9, 3, 7)
z %>% mean(na.rm = TRUE)
mean(z)
u <- c(1, 2, NA, 8, 3, NA, 3, NA, NA, 15)
u %>% is.na %>% sum
sum(is.na(u))
a <- 20:11
b <- -12:-3
c <- c("Hockey", "Football", "Baseball", "Curling", "Rugby",
"Lacrosse", "Basketball", "Tennis", "Cricket", "Soccer")
DF <- data.frame(Col_1 = a, Col_2 = b, Sport = c)
DF
names(DF)
names(DF)[3]
class(DF)
nrow(DF)
ncol(DF)
dim(DF)
head(DF)
tail(DF, 3)
DF$Sport
DF[3,]
DF[, 2]
DF[3, 2]
DF[3, 2:3]
DF[3, 2:3]
DF[c(3, 5), 2]
DF[c(3, 5), 2:3]
DF[c(3, 5), c(2, 3)]
DF[, c("Col_1", "Sport")]
DF[, "Sport"]
# create a 5x2 matrix; byrow = FALSE fills it column by column
A <- matrix(1:10, nrow=5, byrow = FALSE)
A
nrow(A); ncol(A); dim(A)
# create another 5x2 matrix
B <- matrix(21:30, nrow=5)
nrow(B); ncol(B); dim(B)
# create a 2x10 matrix (20 values arranged in 2 rows)
C <- matrix(21:40, nrow=2)
nrow(C); ncol(C); dim(C)
# add them element-wise (A and B have the same dimensions)
A + B
# multiply them element-wise (this is not matrix multiplication)
A * B
# name the columns and rows of each matrix
colnames(A) <- c("Left", "Right")
rownames(A) <- c("1st", "2nd", "3rd", "4th", "5th")
colnames(B) <- c("First", "Second")
rownames(B) <- c("One", "Two", "Three", "Four", "Five")
colnames(C) <- LETTERS[1:10]
rownames(C) <- c("Top", "Bottom")
A; B; C
# matrix transpose
t(A); t(C)
# matrix multiplication: (5x2) %*% (2x5) gives a 5x5 matrix
A %*% t(B)
# matrix multiplication: (5x2) %*% (2x10) gives a 5x10 matrix
A %*% C
# creates a two element list.
list1 <- list(DF, A)
list1
# creates a four element list.
list2 <- list(DF, A, B, C)
list2
Arry <- array(1:18, dim=c(2, 3, 3))
Arry
Arry[1, , 1]
Arry[, 2, 3]
Arry[1, 2, 2]
Setting Working Directory
working directory.
data files are located.working directory.
working directory via point-and-click:
Working Directory” under More.setwd() function; path must be in quotes.Excel, Stata, SPSS, SAS, CSV, JSON, fixed-width, TXT, DAT etc.
The easiest way to read data from a CSV file is to use read.table(). Most people prefer to use read.csv(), which is a wrapper around read.table() with the sep argument preset to a comma (,).
The outcome of using read.table is a data.frame.
We will learn to import a CSV file from your local computer into R using the credit data set.
# Set your working directory to the folder on the computer that
# contains the credit data set.
setwd("C:/Users/ethom/Dropbox/data")
# Read data into R using the read.table() function.
dat_1 <- read.table("credit.csv", header=TRUE, sep=",", stringsAsFactors=FALSE)
dat_1
# Read data into R using the read.csv() function.
dat_1 <- read.csv("credit.csv", header = TRUE, stringsAsFactors = TRUE)
dat_1
# Read a comma-separated file straight from a URL into a data.frame.
theUrl <- "https://stats.idre.ucla.edu/wp-content/uploads/2016/02/test-1.csv"
test.tab <- read.table(file = theUrl, header = TRUE, sep = ",", stringsAsFactors = FALSE)
head(test.tab)
# Read a text file using read.table()'s default separator (any whitespace).
# TRUE is spelled out because T is an ordinary variable that can be reassigned.
test.txt <- read.table("https://stats.idre.ucla.edu/wp-content/uploads/2016/02/test.txt", header = TRUE)
head(test.txt)
read.table(), but there are other alternatives that can read large files into memory.read_delim() from the readr package by Hadley Wickham.fread() from the data.table package by Matt Dowle respectively.read_delim(), and all the data-reading functions in readr, return a tibble, which is an extension of data.frame.library(readr)
theUrl <- "https://stats.idre.ucla.edu/wp-content/uploads/2016/02/test-1.csv"
test.del <- read_delim(file=theUrl, delim=',')
head(test.del)
class(test.del)
read_csv(), read_csv2() and read_tsv() are special cases for when the delimiters are commas (,), semicolons (;) and tabs (\t), respectively.fread() results in a data.table object which is an extension of data.frame.library(data.table)
theUrl <- "https://stats.idre.ucla.edu/wp-content/uploads/2016/02/test-1.csv"
test.fre <- fread(input=theUrl, sep=',', header=TRUE)
head(test.fre)
class(test.fre)
read_delim() or fread() are fast but the decision of which one to use depends upon whether dplyr or data.table is preferred for data manipulation.# Use readxl package to read xls|xlsx
library("readxl")
my_data <- read_excel(file, sheet = "------") # For sheet specify an index or name.
# Use xlsx package
library("xlsx")
my_data <- read.xlsx(file, sheetIndex, header = TRUE)
foreign package has a number of functions like read.table to read in data from other tools.| Function | Format |
|---|---|
| read.spss | SPSS |
| read.dta | Stata |
| read.ssd | SAS |
| read.octave | Octave |
| read.mtp | Minitab |
| read.systat | Systat |
haven() optimized for speed written by Hadley Wickham but results in tibble rather than data.frame can also be used to read data from some standard statistical software.data():data()
data(Seatbelts)
head(Seatbelts, 5)
data(iris)
head(iris, 5)
write.table(), write.csv().readr function: write_tsv(), write_csv(). From xlsx package we use the function write.xlsx() for Excel files.data(iris)
# Base R: write the iris data frame to a CSV file.
write.csv(iris, file = "iris1.csv")
# readr: the output location is passed as `file`; the older `path` argument
# is deprecated and triggers a warning in current readr releases.
library(readr)
write_csv(iris, file = "iris2.csv")
head() for first few rows of a matrix or data frame.tail() for last few rows of a matrix or data frame.dim() for dimension of a matrix or data frame.str() for displaying the structure of an R object.nrow() for number of rows of a matrix or data frame.ncol() for number of columns of a matrix or data frame.summary() for numeric variables.quantile() for quartiles.table() for categorical variables.sum(is.na()) for counting the number of NAs in the entire dataset.If you need to change the data type for any column, use the following functions:
as.character() converts to a text string.as.numeric() converts to a number.as.factor() converts to a categorical variable.as.integer() converts to an integer.url_credit <- "https://raw.githubusercontent.com/sylvadon5/data-files/main/credit.csv"
# Read the credit data, keeping text columns as character vectors.
# The argument is spelled out in full (stringsAsFactors) rather than relying
# on partial argument matching of "stringsAsFactor".
credit_data <- read.csv(url_credit, header = TRUE, stringsAsFactors = FALSE)
# credit_data <- read.csv("data/credit.csv", header = TRUE, stringsAsFactors = FALSE)
dim(credit_data)
[1] 1000 17
str(credit_data)
'data.frame': 1000 obs. of 17 variables:
$ checking_balance : chr "< 0 DM" "1 - 200 DM" "unknown" "< 0 DM" ...
$ months_loan_duration: int 6 48 12 42 24 36 24 36 12 30 ...
$ credit_history : chr "critical" "good" "critical" "good" ...
$ purpose : chr "furniture/appliances" "furniture/appliances" "education" "furniture/appliances" ...
$ amount : int 1169 5951 2096 7882 4870 9055 2835 6948 3059 5234 ...
$ savings_balance : chr "unknown" "< 100 DM" "< 100 DM" "< 100 DM" ...
$ employment_duration : chr "> 7 years" "1 - 4 years" "4 - 7 years" "4 - 7 years" ...
$ percent_of_income : int 4 2 2 2 3 2 3 2 2 4 ...
$ years_at_residence : int 4 2 3 4 4 4 4 2 4 2 ...
$ age : int 67 22 49 45 53 35 53 35 61 28 ...
$ other_credit : chr "none" "none" "none" "none" ...
$ housing : chr "own" "own" "own" "other" ...
$ existing_loans_count: int 2 1 1 1 2 1 1 1 1 2 ...
$ job : chr "skilled" "skilled" "unskilled" "skilled" ...
$ dependents : int 1 1 2 2 2 2 1 1 1 1 ...
$ phone : chr "yes" "no" "no" "no" ...
$ default : chr "no" "yes" "no" "no" ...
dplyr package from the tidyverse packages to manipulate data.dplyr:
select: Choose which columns to include.filter: Filter the data. group_by: Group the data by a categorical variable.summarize: Summarize, or aggregate (for each group if following group_by). Often used in conjunction with functions including: mean, median, max, min, sum, n etc.mutate: Create new column(s) in the data, or change existing column(s).These functions can be chained together using the operator %>% which makes the output of one line of code the input for the next.
filtering include:
x < y (less than)x > y (greater than)x <= y (less than or equal to)x >= y (greater than or equal to)x == y (equal)x != y (not equal)! x (NOT operator)x & y (AND operator)x | y (OR operator)Gapminder data: Excerpt of the Gapminder data on life expectancy, GDP per capita, and population by country
# install.packages("gapminder")
library(gapminder)
library(tidyverse)
filter_1 <- filter(gapminder, country == "United States")
filter_2 <- filter(gapminder, country != "United States")
filter_3 <- filter(gapminder, pop < 1000000)
filter_4 <- filter(gapminder, pop < 1000000 | year == 2007)
filter_5 <- filter(gapminder, country == "United States") %>%
filter(lifeExp >= 66 & lifeExp <= 80)
filter_6 <- filter(gapminder, pop < 1000000 & year != 2007)
filter_7 <- filter(gapminder, country %in% c("United States", "Canada")) %>%
filter(year > 2000) %>%
filter(pop > 100000) %>%
filter(lifeExp >= 18)
# Keep rows whose continent is NOT Asia, Europe or the Americas, then narrow
# by year, population and life expectancy.
# The gapminder continent levels are Africa, Americas, Asia, Europe and
# Oceania, so the level must be written "Americas"; "America" matches nothing
# and would leave those rows in the result.
filter_8 <- filter(gapminder, !continent %in% c("Asia", "Europe", "Americas")) %>%
  filter(year > 2000) %>%
  filter(pop > 100000) %>%
  filter(lifeExp >= 18)
select_1 <- select(gapminder, -country)
select_2 <- select(gapminder, lifeExp, pop, gdpPercap, continent)
select_2 <- select(gapminder, c(lifeExp, pop, gdpPercap, continent))
select_2 <- select(gapminder, c("lifeExp", "pop", "gdpPercap", "continent"))
select_3 <- select(gapminder, -c(lifeExp, pop, gdpPercap, continent))
select_3 <- select(gapminder, -c("lifeExp", "pop", "gdpPercap", "continent"))
filter_select_1 <- filter(gapminder, year == 2007) %>%
select(country, year, lifeExp)
filter_select_2 <- filter(gapminder, country == "United States" | country == "Canada",
year > 2000) %>%
select(country, year, lifeExp)
mutate_1 <- mutate(gapminder, popMil = round(pop / 1000000, 1))
mutate_2 <- mutate(gapminder, popMil = round(pop / 1000000, 1)) %>%
mutate(Log_lifeExp = log(lifeExp))
group_year <- group_by(gapminder, year)
group_continent <- group_by(gapminder, continent)
# Needed to get higher moments like skewness/kurtosis
library(moments)
group_1 <- group_by(gapminder, continent) %>%
summarize(mean = mean(lifeExp),
stdev = sd(lifeExp),
median = median(lifeExp),
min = min(lifeExp),
max = max(lifeExp),
n = n(),
se = stdev/sqrt(n),
skew = skewness(lifeExp),
kur = kurtosis(lifeExp))
group_2 <- group_by(gapminder, year) %>%
summarise(mean = mean(lifeExp),
stdev = sd(lifeExp),
median = median(lifeExp),
min = min(lifeExp),
max = max(lifeExp),
n = n(),
se = stdev/sqrt(n),
skew = skewness(lifeExp),
kur = kurtosis(lifeExp))
sumr_1 <- summarise(gapminder,
mean = mean(lifeExp),
stdev = sd(lifeExp),
median = median(lifeExp),
min = min(lifeExp),
max = max(lifeExp),
n = n(),
se = stdev/sqrt(n),
skew = skewness(lifeExp),
kur = kurtosis(lifeExp))
sumr_2 <- summarise(gapminder,
mean = mean(gdpPercap),
stdev = sd(gdpPercap),
median = median(gdpPercap),
min = min(gdpPercap),
max = max(gdpPercap),
n = n(),
se = stdev/sqrt(n),
skew = skewness(gdpPercap),
kur = kurtosis(gdpPercap))
# Bind to create a Data frame
dd <- round(as.data.frame(rbind(sumr_1, sumr_2)), 3)
rownames(dd) <- c("Life Expectancy", "GDP Per Capita")
dd
Scatter Plot
URL <- "https://raw.githubusercontent.com/sylvadon5/data-files/main/credit.csv"
credit_data <- read.csv(URL, header = TRUE)
plot(credit_data$age, credit_data$amount,
xlab = "Age", ylab = "Amount", main = "Amount vs Age",
pch = 25, col = "red")
Histogram
hist(credit_data$age, xlab = "Age", ylab = "Frequency", main = "Histogram",
freq = TRUE, col = "purple", border = "red", breaks = 7)
grid()
Density Plot
plot(density(credit_data$amount), col = "green",
xlab = "Amount", main = "Density", lwd = 5)
grid()
Boxplot
boxplot(credit_data$amount~credit_data$job, xlab = "Job", ylab = "Amount",
main = "Boxplot", horizontal = FALSE, col = "blue")
grid()
Bar Graph
# Bar graph of how many applicants fall into each years_at_residence value.
# The axis label and title describe the variable that is actually tabulated
# (years_at_residence), not the default status.
frequency <- table(credit_data$years_at_residence)
barplot(frequency, xlab = "Years at Residence", main = "Years at Residence",
        ylim = c(0, 500))
ggplot(<DATA>, mapping = aes(<MAPPINGS>)) +
<GEOM_FUNCTION>()
DATA: Data set containing the variables to be used for plotting.aes: Stands for “Aesthetic”. Function that defines the variables to be plotted and other plotting characteristics such as color, shape, size etc.GEOM_FUNCTION: Defines how the data is to be represented in the plot. Popular GEOM_FUNCTIONS include:
geom_point() for scatter plots.geom_boxplot() for boxplots.geom_histogram() for histograms.geom_bar() for bar graphs.GEOM_FUNCTION to the plot, we use + operator.+ operator can also be used to add other layers such as labs() to the plot.Scatter Plot
library(tidyverse)
ggplot(gapminder, aes(x = lifeExp, y = gdpPercap)) +
geom_point(shape = 25, size = 2, color = "blue") +
labs(title = "GDP Per Capita vs Life Expectancy", x = "Years", y = "Dollars")
ggplot(gapminder, aes(x = lifeExp, y = gdpPercap, color = continent)) +
geom_point(shape = 1, size = 2) +
labs(title = "GDP Per Capita vs Life Expectancy", x = "Years", y = "Dollars")
facet_wrap(): One categorical variable. facet_grid(): Two categorical variables.
ggplot(gapminder, aes(x = lifeExp, y = gdpPercap)) +
geom_point(shape = 2, size = 1, color = "purple") +
facet_wrap(vars(continent)) +
labs(title = "GDP Per Capita vs Life Expectancy", x = "Years", y = "Dollars")
Histogram
# Frequency histogram: bar heights are the per-bin counts.
# after_stat() replaces the deprecated ..count.. / ..density.. notation.
ggplot(gapminder, aes(x = lifeExp)) +
  geom_histogram(aes(y = after_stat(count)), binwidth = 8, col = "green", fill = "red") +
  labs(title = "Histogram", x = "Life Expectancy", y = "Frequency")
# Density-scaled histogram, one panel per continent.
# NOTE: the density statistic is scaled so that bar areas sum to 1, i.e. it
# is the relative frequency divided by the bin width.
ggplot(gapminder, aes(x = lifeExp)) +
  geom_histogram(aes(y = after_stat(density)), binwidth = 8) +
  facet_wrap(vars(continent)) +
  labs(title = "Relative Frequency Histogram", x = "Life Expectancy", y = " Relative Frequency")
ggplot(gapminder, aes(x = continent, y = lifeExp)) +
geom_boxplot(col = "red", fill = "green") +
labs(title = "Boxplot", x = "Continent", y = "Life Expectancy")
ggplot(gapminder, aes(x = continent, y = lifeExp)) +
geom_boxplot(fill = "blue", col = "brown") +
facet_wrap(vars(year)) +
labs(title = "Boxplot", x = "Continent", y = "Life Expectancy")
Bar chart
ggplot(gapminder, aes(x = continent)) +
geom_bar(fill = "snow") +
coord_flip() +
labs(title="Frequency Distribution of Continents", x="Continents", y="Frequency")
ggplot(gapminder, aes(x = continent)) +
geom_bar(fill = "darkgrey") +
labs(title="Frequency Distribution of Continents", x="Continents", y="Frequency")
# Normal Random Values: 5 draws with mean 2 and standard deviation 3
x <- rnorm(n = 5, mean = 2, sd = 3)
x
# Standard Normal Random Values
# (the standard deviation argument of rnorm() is named `sd`)
x <- rnorm(n = 5, mean = 0, sd = 1)
x
# Student t Random Values with 20 degrees of freedom
t <- rt(n = 5, df = 20)
t
# Uniform Random Values on the interval [1, 6]
y <- runif(n = 5, min = 1, max = 6)
y
# Binomial Random Values: 5 draws, each the number of successes in
# 10 trials with success probability 0.5.
# rbinom() takes n (number of draws), size (trials) and prob.
z <- rbinom(n = 5, size = 10, prob = 0.5)
z
# Poisson Random Values: the rate parameter of rpois() is named `lambda`
m <- rpois(n = 5, lambda = 3)
m
# Simulated Data: 10 rows with two predictors and a response
dat_c <- data.frame(x1 = rnorm(10, 0, 1), x2 = runif(10, 0, 1), y = rnorm(10, 3, 0.2))
dat_c
Descriptive Statistics
library(gapminder)
library(psych)
# Descriptive Statistics
describe(gapminder$lifeExp)
# Descriptive Statistics by Group
describeBy(gapminder$lifeExp, gapminder$continent)
Correlation
# Syntax (template only, kept as a comment so the script runs top to bottom;
# cor() needs either both x and y or a matrix-like x):
# cor(x, method = c("pearson", "kendall", "spearman"))
# Correlation between two numeric vectors
cor(gapminder$lifeExp, gapminder$gdpPercap)
# Correlation matrix of several numeric columns
cor(gapminder[, c("lifeExp", "pop", "gdpPercap")])
library("Hmisc")
corr_matrix <- rcorr(as.matrix(gapminder[, c("lifeExp", "pop", "gdpPercap")]))
corr_matrix
install.packages("rmarkdown", dependencies=TRUE)install.packages("tidyverse", dependencies=TRUE)install.packages("tinytex", dependencies=TRUE)tinytex::install_tinytex()File | New File | R Markdown.TRUE or FALSE.if, else, ifelse and switch.==, <, >, <=, >=, and !=.as.numeric(TRUE)
as.numeric(FALSE)
2 == 2
2 > 2
2 >= 2
5 != 3
# set up a variable to hold 1
toCheck <- 1
# if toCheck is equal to 1, print hello
if(toCheck == 1)
{
print("hello")
}
# Define a function that prints a greeting chosen by its input.
check.bool <- function(x) {
  # "hello" when the input equals 1, "goodbye" for anything else
  greeting <- if (x == 1) "hello" else "goodbye"
  print(greeting)
}
check.bool(1)
check.bool(3)
# Three-way version: prints "hello" for 1, "goodbye" for 0 and "confused"
# for any other input.
check.bool <- function(x) {
  reply <- if (x == 1) {
    "hello"
  } else if (x == 0) {
    "goodbye"
  } else {
    "confused"
  }
  print(reply)
}
check.bool(1)
check.bool(3)
# see if 1 == 1
ifelse(1 == 1, "Yes", "No")
# see if 1 == 0
ifelse(1 == 0, "Yes", "No")
toTest <- c(1, 1, 0, 1, 0, 1)
ifelse(toTest == 1, "Yes", "No")
ifelse(toTest == 1, toTest*3, toTest)
a <- c(1, 1, 0, 1)
b <- c(2, 1, 0, 1)
# this checks each element of a and each element of b
ifelse(a == 1 & b == 1, "Yes", "No")
ifelse(a == 1 | b == 2, "Yes", "No")
for loop and while loop see these links: