Basic R Programming

Introduction

This cheatsheet is designed for the course COMS4995 (Empirical Methods of Data Science).

Packages

Installation method: install.packages("package_name")

library(ggplot2) #create graphics

library(dplyr) #data manipulation using %>%

library(caTools) #machine learning

library(udpipe) #lemmatization
#Download the English model only when it is not already on disk, so re-running
#the script does not fetch it again; model$file_model is later passed to
#udpipe_load_model()
model <- udpipe_download_model(language = "english", overwrite = FALSE)

library(sentimentr) #sentiment analysis

library(wordcloud) #word cloud visualization

library(tidytext) #text mining

Operations on Matrix

Building matrix by “rbind”
Set column names and row names
Extract an element from matrix
Delete specific row from matrix
Visualize matrix by matplot

#Build matrix by "rbind"
#Each vector below becomes one row; rbind() names the rows after the variables
row1 <- c("Empirical", "Methods", "Data", "Science")
#Draw four distinct scores between 90 and 100 for each numeric row
score_range <- 90:100
row2 <- sample(score_range, size = 4)
row3 <- sample(score_range, size = 4)
#Mixing text and numbers makes every cell a character string
matrix <- rbind(row1, row2, row3)
matrix

##      [,1]        [,2]      [,3]   [,4]     
## row1 "Empirical" "Methods" "Data" "Science"
## row2 "98"        "96"      "91"   "100"    
## row3 "92"        "100"     "95"   "94"

#Set column names and row names
#dimnames() takes the row names first, then the column names
dimnames(matrix) <- list(
  c("r1", "r2", "r3"),
  c("c1", "c2", "c3", "c4")
)
matrix

##    c1          c2        c3     c4       
## r1 "Empirical" "Methods" "Data" "Science"
## r2 "98"        "96"      "91"   "100"    
## r3 "92"        "100"     "95"   "94"

#Extract an element from matrix
#Look the cell up by its row name and column name
matrix["r1", "c2"]

## [1] "Methods"

#The same cell addressed by position: row 1, column 2
matrix[1L, 2L]

## [1] "Methods"

#Delete specific row from matrix
#Remove the first row (the text labels); drop = FALSE keeps the result a matrix
#even if only one row were left
matrix <- matrix[-1, , drop = FALSE]

#rbind() with a text row stored every cell as a character string, so turn the
#remaining scores back into numbers before plotting (dimnames are preserved)
mode(matrix) <- "numeric"

#Visualize matrix by matplot
#Each column is drawn as one line across the rows
matplot(matrix, type = "l")

Operations on Data Frames

Take “mtcars” as an example.
1. Select and exclude specific variables from dataframe
2. Filter by condition: filter(df, condition)
3. Sort rows in descending and ascending order: arrange(df, desc(a), b)

#Use the mtcars data set under the shorter name df
df <- mtcars
#Select specific variables from dataframe
#Keep only the mpg and disp columns
df_2 <- df %>% select(mpg, disp)
head(df_2, n = 3)

##                mpg disp
## Mazda RX4     21.0  160
## Mazda RX4 Wag 21.0  160
## Datsun 710    22.8  108

#Exclude specific variable from dataframe
#A minus sign in front of a column name drops that column
df_3 <- df_2 %>% select(-mpg)
head(df_3, n = 3)

##               disp
## Mazda RX4      160
## Mazda RX4 Wag  160
## Datsun 710     108

#Filter by condition: filter(df, condition)
#Conditions separated by commas must all hold, the same as joining them with &
df_4 <- df_2 %>% filter(mpg > 30, disp > 75)
head(df_4, n = 3)

##               mpg disp
## Fiat 128     32.4 78.7
## Honda Civic  30.4 75.7
## Lotus Europa 30.4 95.1

#Sort in descending order and ascending order
#Highest mpg first; rows with equal mpg are ordered by increasing disp
df_5 <- arrange(df_4, desc(mpg), disp)
head(df_5, n = 3)

##               mpg disp
## Fiat 128     32.4 78.7
## Honda Civic  30.4 75.7
## Lotus Europa 30.4 95.1

Visualization Using ggplot2

Histogram
Box Plot
Violin Plot

#Histogram
#Distribution of mpg split into 30 bins
ggplot(data = df, mapping = aes(x = mpg)) +
  geom_histogram(bins = 30) +
  labs(x = "MPG", y = "Count of MPG")

#Box Plot
#factor() makes gear a discrete variable, giving one box per gear value
ggplot(data = df, mapping = aes(x = factor(gear), y = mpg)) +
  geom_boxplot() +
  labs(x = "Gear", y = "MPG")

#Violin Plot
#Same mapping as the box plot: one violin of mpg values per gear value
ggplot(data = df, mapping = aes(x = factor(gear), y = mpg)) +
  geom_violin() +
  labs(x = "Gear", y = "MPG")

Case Study Using Machine Learning

The given CSV file contains students’ evaluations of two distinct professors. The data were collected from “ratemyprofessors.com”.
1. Read csv file
2. Deal with missing data: replace missing numerical values with the column average, and remove rows whose text values are missing.
3. Encode categorical values
4. Prepare the training and testing sets

#Read csv file and store as a data frame. Use View(prof_df) to view the original dataset.
#The path is relative to the current working directory
prof_df <- read.csv("Example.csv")
#Print per-column summary statistics
summary(prof_df)

##  Institution         Department        Review_Comment     Review_Quality 
##  Length:27          Length:27          Length:27          Min.   :1.000  
##  Class :character   Class :character   Class :character   1st Qu.:2.000  
##  Mode  :character   Mode  :character   Mode  :character   Median :5.000  
##                                                           Mean   :3.889  
##                                                           3rd Qu.:5.000  
##                                                           Max.   :5.000  
##                                                                          
##  Review_Difficulty Review_Sentiment   Review_Date       
##  Min.   :2.000     Length:27          Length:27         
##  1st Qu.:3.000     Class :character   Class :character  
##  Median :3.000     Mode  :character   Mode  :character  
##  Mean   :3.385                                          
##  3rd Qu.:4.000                                          
##  Max.   :5.000                                          
##  NA's   :1

#View(prof_df)

#Deal with Missing data
#Replace missing difficulty ratings with the mean of the observed ratings
missing_difficulty <- is.na(prof_df$Review_Difficulty)
prof_df$Review_Difficulty[missing_difficulty] <-
  mean(prof_df$Review_Difficulty, na.rm = TRUE)
#Drop the rows whose comment is missing or an empty string
has_comment <- !is.na(prof_df$Review_Comment) & prof_df$Review_Comment != ""
prof_df <- prof_df[has_comment, ]

#Encode categorical values
#"AWESOME" becomes level "1" and "AWFUL" becomes level "0"; any value outside
#these two levels turns into NA
sentiment_levels <- c("AWESOME", "AWFUL")
prof_df$Review_Sentiment <- factor(
  prof_df$Review_Sentiment,
  levels = sentiment_levels,
  labels = c("1", "0")
)

#Prepare for training and testing sets
#sample.split() marks rows TRUE (training, ratio 0.7) or FALSE (testing); the
#draw is random, so call set.seed() beforehand if the split must be repeatable
split <- sample.split(prof_df$Review_Quality, SplitRatio = 0.7)
train <- subset(prof_df, split)
test <- subset(prof_df, !split)

Case Study Using Natural Language Processing

Use the same CSV file as in the previous section.
1. Apply data cleaning on “Review_Comment” variable
2. Use udpipe for lemmatization
3. Apply sentiment analysis
4. Count occurrences of each word across all comments
5. Create word cloud

#Pull the review text out of the data frame and preview the first comment
comments <- prof_df[["Review_Comment"]]
head(comments, n = 1)

## [1] "Professor Blazej is the best ! He goes out of his way to help you. Going into this class I thought it was going to be really hard but I was wrong. The class was awesome!!! The material was fun to learn and Professor Blazej helped me out whenever I needed him. He was easy to get a hold of and he responded to all of my e-mails asap."

#Apply data cleaning on "Review_Comment" variable
#Links, HTML tags and @tags are removed first: their patterns rely on
#punctuation (":", "/", "<", ">", "@"), so they can only match before the
#punctuation itself is stripped
comments <- tolower(comments) #convert sentences to lower case
comments <- gsub("https?://\\S+", " ", comments) #remove links (http and https)
comments <- gsub("<[^>]*>", " ", comments) #remove html tags
comments <- gsub("@\\S+", " ", comments) #remove tags, also at the end of a comment
comments <- gsub("[[:digit:]]+", " ", comments) #remove digits
#[[:punct:]] covers every punctuation character, including , : ' ! and "
comments <- gsub("[[:punct:]]", " ", comments) #remove punctuation
head(comments, 1)

## [1] "professor blazej is the best   he goes out of his way to help you  going into this class i thought it was going to be really hard but i was wrong  the class was awesome    the material was fun to learn and professor blazej helped me out whenever i needed him  he was easy to get a hold of and he responded to all of my e mails asap "

#Use udpipe for lemmatization
#Load the model from the path stored in model$file_model
udmodel_english <- udpipe_load_model(model$file_model)
#Annotate the cleaned comments and convert the annotation to a data frame
lemma_report <- udpipe_annotate(udmodel_english, x = comments) %>%
  as.data.frame()
#Paste the lemma values together within each doc_id / sentence_id group
lemma_storage <- paste.data.frame(
  lemma_report,
  term = "lemma",
  group = c("doc_id", "sentence_id")
)
lemmatized_comments <- lemma_storage[["lemma"]]
head(lemmatized_comments, n = 1)

## [1] "professor blazej be the best he go out of he way to help you go into this class I think it be go to be really hard but I be wrong the class be awesome the material be fun to learn and professor blazej help I out whenever I need he he be easy to get a hold of and he respond to all of my e mail asap"

#Apply sentiment analysis
#Score the cleaned comments and preview the first three result rows
sentiment_report <- sentiment_by(text.var = comments)
head(sentiment_report, n = 3)

##    element_id word_count sd ave_sentiment
## 1:          1         69 NA     0.3084587
## 2:          2         59 NA     0.7655108
## 3:          3         57 NA     1.0053201

#Count occurrences of each word across all lemmatized comments
#Start from a one-column data frame, then split it into one row per word
comment_df <- data.frame(lemmatized_comments = lemmatized_comments)
word_df <- unnest_tokens(comment_df, word, "lemmatized_comments")
#Tally each distinct word, most frequent first
word_count_df <- count(word_df, word, sort = TRUE)

#Create word cloud
#Draw words occurring at least 3 times (250 words at most), most frequent
#first, with rot.per = 0.30 and the 8-color "Dark2" palette
cloud_colors <- brewer.pal(8, "Dark2")
wordcloud(
  words = word_count_df$word,
  freq = word_count_df$n,
  min.freq = 3,
  max.words = 250,
  random.order = FALSE,
  rot.per = 0.30,
  colors = cloud_colors
)