Fast review of basic R

Objects in R

# Use <- or = to assign values to objects
# c() combines its arguments into a single vector; because a vector holds one
# type, mixing a string with a number converts the number to character
k <- c("Hello", 345)
k
[1] "Hello" "345"  
# Check class
class(k)
[1] "character"
# Summarize object
summary(k)
   Length     Class      Mode 
        2 character character 
# Numeric object
l <- c(51,100,511,5100,51000)
l
[1]    51   100   511  5100 51000
class(l)
[1] "numeric"

Different types of objects

# Matrix object: a 2 x 2 matrix in which every entry is 1
k <- matrix(rep(1, times = 4), nrow = 2, ncol = 2)
k
     [,1] [,2]
[1,]    1    1
[2,]    1    1
# A single-column matrix; the number of rows follows from the data length
k <- matrix(c(2, 6, 8, 1), ncol = 1)
k
     [,1]
[1,]    2
[2,]    6
[3,]    8
[4,]    1

Different types of objects

class(k)
[1] "matrix" "array" 

Reading in csv files in R

# Read each CSV file into its own data frame.
# NOTE(review): these are absolute paths into one user's Downloads folder;
# adjust them to wherever the files live on your machine.
customers <- read.csv("C:/Users/gacas/Downloads/customers1000.csv")
organizations <- read.csv("C:/Users/gacas/Downloads/organizations1000.csv")

# read.table() is the more general reader for delimited text files
# install packages with install.packages()

Common Data Wrangling Techniques

  1. Filtering Data (selecting specific columns or rows)
  2. Combining Datasets
  3. Summarizing Datasets

data.frame Object

library(dplyr)

# Create a data frame with five rows and assign it to an object named "data"
data <- data.frame(
  ID = 1:5,
  Name = c("Alice", "Bob", "Charlie", "David", "Eve"),
  Age = c(25, 30, 22, 35, 28),
  City = c("NY", "LA", "Chicago", "LA", "Boston"),
  Score = c(85, 92, 78, 88, 95)
)

print(data)
  ID    Name Age    City Score
1  1   Alice  25      NY    85
2  2     Bob  30      LA    92
3  3 Charlie  22 Chicago    78
4  4   David  35      LA    88
5  5     Eve  28  Boston    95

Filtering Data using dplyr

# Keep the rows whose Name is one of the listed values.
# %in% tests set membership for every row. Comparing with == against a
# length-2 vector would recycle it down the column (Alice, Bob, Alice, ...)
# and only match names that happen to sit in the matching position.
data %>% filter(
  Name %in% c("Alice", "Bob")
)
  ID  Name Age City Score
1  1 Alice  25   NY    85
2  2   Bob  30   LA    92
# Keep only the rows where Age is below 30
data %>%
  filter(Age < 30)
  ID    Name Age    City Score
1  1   Alice  25      NY    85
2  3 Charlie  22 Chicago    78
3  5     Eve  28  Boston    95

Selecting columns using dplyr

# Keep only the Name and City columns
data %>%
  select(Name, City)
     Name    City
1   Alice      NY
2     Bob      LA
3 Charlie Chicago
4   David      LA
5     Eve  Boston
# Keep only the Age and Score columns
data %>%
  select(Age, Score)
  Age Score
1  25    85
2  30    92
3  22    78
4  35    88
5  28    95

Filtering data and selecting columns

# Select two columns first, then keep the row for Bob
data %>%
  select(Name, City) %>%
  filter(Name == "Bob")
  Name City
1  Bob   LA
# Filter on Age first (Age is dropped afterwards), then keep Score and City
data %>%
  filter(Age < 30) %>%
  select(Score, City)
  Score    City
1    85      NY
2    78 Chicago
3    95  Boston

Creating new data.frames from formatted data

# Store the selected columns in a new data frame.
# <- is used for the assignment to match every other assignment in this file.
mydata <- data %>% select(Age, Score)
print(mydata)
  Age Score
1  25    85
2  30    92
3  22    78
4  35    88
5  28    95

Calculations

print(data)
  ID    Name Age    City Score
1  1   Alice  25      NY    85
2  2     Bob  30      LA    92
3  3 Charlie  22 Chicago    78
4  4   David  35      LA    88
5  5     Eve  28  Boston    95
summary(data)
       ID        Name                Age         City               Score     
 Min.   :1   Length:5           Min.   :22   Length:5           Min.   :78.0  
 1st Qu.:2   Class :character   1st Qu.:25   Class :character   1st Qu.:85.0  
 Median :3   Mode  :character   Median :28   Mode  :character   Median :88.0  
 Mean   :3                      Mean   :28                      Mean   :87.6  
 3rd Qu.:4                      3rd Qu.:30                      3rd Qu.:92.0  
 Max.   :5                      Max.   :35                      Max.   :95.0  
mean(data$Age)
[1] 28

Ways you can extract a column from an object

# Using the $ operator to pull out one column by name (returns a vector)
data$Age
[1] 25 30 22 35 28
# Using brackets [rows, columns]: all rows, third column (Age)
data[, 3]
[1] 25 30 22 35 28

Bigger datasets

library(ggplot2)
data(diamonds)
diamonds
# A tibble: 53,940 × 10
   carat cut       color clarity depth table price     x     y     z
   <dbl> <ord>     <ord> <ord>   <dbl> <dbl> <int> <dbl> <dbl> <dbl>
 1  0.23 Ideal     E     SI2      61.5    55   326  3.95  3.98  2.43
 2  0.21 Premium   E     SI1      59.8    61   326  3.89  3.84  2.31
 3  0.23 Good      E     VS1      56.9    65   327  4.05  4.07  2.31
 4  0.29 Premium   I     VS2      62.4    58   334  4.2   4.23  2.63
 5  0.31 Good      J     SI2      63.3    58   335  4.34  4.35  2.75
 6  0.24 Very Good J     VVS2     62.8    57   336  3.94  3.96  2.48
 7  0.24 Very Good I     VVS1     62.3    57   336  3.95  3.98  2.47
 8  0.26 Very Good H     SI1      61.9    55   337  4.07  4.11  2.53
 9  0.22 Fair      E     VS2      65.1    61   337  3.87  3.78  2.49
10  0.23 Very Good H     VS1      59.4    61   338  4     4.05  2.39
# ℹ 53,930 more rows

The table function

table(diamonds$cut)

     Fair      Good Very Good   Premium     Ideal 
     1610      4906     12082     13791     21551 

Summarizing Data with summarize()

# Preview the first six rows of the diamonds whose cut is "Ideal"
diamonds %>%
  filter(cut == "Ideal") %>%
  head()
# A tibble: 6 × 10
  carat cut   color clarity depth table price     x     y     z
  <dbl> <ord> <ord> <ord>   <dbl> <dbl> <int> <dbl> <dbl> <dbl>
1  0.23 Ideal E     SI2      61.5    55   326  3.95  3.98  2.43
2  0.23 Ideal J     VS1      62.8    56   340  3.93  3.9   2.46
3  0.31 Ideal J     SI2      62.2    54   344  4.35  4.37  2.71
4  0.3  Ideal I     SI2      62      54   348  4.31  4.34  2.68
5  0.33 Ideal I     SI2      61.8    55   403  4.49  4.51  2.78
6  0.33 Ideal I     SI2      61.2    56   403  4.49  4.5   2.75
# Average price across all diamonds whose cut is "Ideal"
diamonds %>%
  filter(cut == "Ideal") %>%
  dplyr::summarise(AveragePrice = mean(price))
# A tibble: 1 × 1
  AveragePrice
         <dbl>
1        3458.
# This is equivalent to filtering into a new object first and then taking
# the mean of its price column
new.dataset <- diamonds %>% filter(cut == "Ideal")
mean(new.dataset$price)
[1] 3457.542

Summarizing Data in groups with group_by()

# Average price within each cut category (one output row per cut)
diamonds %>%
  group_by(cut) %>%
  summarise(AveragePrice = mean(price))
# A tibble: 5 × 2
  cut       AveragePrice
  <ord>            <dbl>
1 Fair             4359.
2 Good             3929.
3 Very Good        3982.
4 Premium          4584.
5 Ideal            3458.