Introduction

This cheatsheet is designed for the course COMS4995 (Empirical Methods of Data Science).

Packages

Installation method: install.packages("package_name")

library(ggplot2) #create graphics

library(dplyr) #data manipulation using %>%

library(caTools) #machine learning

library(udpipe) #lemmatization
model <- udpipe_download_model(language = "english")

library(sentimentr) #sentiment analysis

library(wordcloud) #word cloud visualization

library(tidytext) #text mining

Operations on Matrix

  1. Building matrix by “rbind”
  2. Set column names and row names
  3. Extract an element from matrix
  4. Delete specific row from matrix
  5. Visualize matrix by matplot
# Build a matrix by stacking vectors as rows with rbind().
# Mixing a character row with numeric rows coerces every cell to character,
# which is why the scores print in quotes below. The printed values are one
# sample run; sample() is random, so they differ between runs.
row1 <- c("Empirical", "Methods", "Data", "Science")
row2 <- sample(90:100, 4, replace = FALSE)
row3 <- sample(90:100, 4, replace = FALSE)
matrix <- rbind(row1, row2, row3)
matrix
##      [,1]        [,2]      [,3]   [,4]     
## row1 "Empirical" "Methods" "Data" "Science"
## row2 "98"        "96"      "91"   "100"    
## row3 "92"        "100"     "95"   "94"
# Set column names and row names
colnames(matrix) <- c("c1", "c2", "c3", "c4")
rownames(matrix) <- c("r1", "r2", "r3")
matrix
##    c1          c2        c3     c4       
## r1 "Empirical" "Methods" "Data" "Science"
## r2 "98"        "96"      "91"   "100"    
## r3 "92"        "100"     "95"   "94"
# Extract an element from the matrix, by dimnames or by position
matrix["r1", "c2"]
## [1] "Methods"
matrix[1, 2]
## [1] "Methods"
# Delete a specific row with a negative index.
# drop = FALSE guarantees the result stays a matrix even if one row remains.
matrix <- matrix[-1, , drop = FALSE]

# Visualize the matrix with matplot (one line per column).
# The remaining cells are still character strings ("98", "96", ...), so
# convert them to numbers explicitly instead of relying on implicit coercion
# inside the plotting code. mode<- keeps the dim and dimnames attributes.
mode(matrix) <- "numeric"
matplot(matrix, type = "l")

Operations on Data Frames

Take “mtcars” as an example.
1. Select and exclude specific variables from dataframe
2. Filter by condition: filter(df, condition)
3. Sort in descending and ascending order: arrange(df, desc(col1), col2)

df <- mtcars
# Keep only the listed columns
df_2 <- df %>% select(mpg, disp)
df_2 %>% head(3)
##                mpg disp
## Mazda RX4     21.0  160
## Mazda RX4 Wag 21.0  160
## Datsun 710    22.8  108
# Drop a column by prefixing its name with a minus sign
df_3 <- df_2 %>% select(-mpg)
df_3 %>% head(3)
##               disp
## Mazda RX4      160
## Mazda RX4 Wag  160
## Datsun 710     108
# Keep rows that satisfy every condition (comma-separated conditions are ANDed)
df_4 <- df_2 %>% filter(mpg > 30, disp > 75)
df_4 %>% head(3)
##               mpg disp
## Fiat 128     32.4 78.7
## Honda Civic  30.4 75.7
## Lotus Europa 30.4 95.1
# Sort by mpg descending, breaking ties by disp ascending
df_5 <- arrange(df_4, desc(mpg), disp)
df_5 %>% head(3)
##               mpg disp
## Fiat 128     32.4 78.7
## Honda Civic  30.4 75.7
## Lotus Europa 30.4 95.1

Visualization Using GGPLOT

  1. Histogram
  2. Box Plot
  3. Violin Plot
# Histogram: distribution of mpg across 30 bins
ggplot(df, aes(x = mpg)) +
  geom_histogram(bins = 30) +
  labs(x = "MPG", y = "Count of MPG")

# Box plot: mpg by number of gears (gear is treated as a category)
ggplot(df, aes(x = factor(gear), y = mpg)) +
  geom_boxplot() +
  labs(x = "Gear", y = "MPG")

# Violin plot: same grouping as the box plot, drawn with geom_violin
ggplot(df, aes(x = factor(gear), y = mpg)) +
  geom_violin() +
  labs(x = "Gear", y = "MPG")

Case Study Using Machine Learning

The given csv file contains students’ evaluations for two distinct professors. Data are collected from “ratemyprofessors.com”.
1. Read csv file
2. Deal with missing data: replace missing numerical values with the column average, and remove rows whose comment text is missing or empty.
3. Encode categorical values
4. Prepare for training and testing sets

# Load the reviews from the csv file into a data frame.
# Run View(prof_df) to browse the original dataset interactively.
prof_df <- read.csv("Example.csv")
# Per-column overview; the NA's row flags one missing Review_Difficulty value
summary(object = prof_df)
##  Institution         Department        Review_Comment     Review_Quality 
##  Length:27          Length:27          Length:27          Min.   :1.000  
##  Class :character   Class :character   Class :character   1st Qu.:2.000  
##  Mode  :character   Mode  :character   Mode  :character   Median :5.000  
##                                                           Mean   :3.889  
##                                                           3rd Qu.:5.000  
##                                                           Max.   :5.000  
##                                                                          
##  Review_Difficulty Review_Sentiment   Review_Date       
##  Min.   :2.000     Length:27          Length:27         
##  1st Qu.:3.000     Class :character   Class :character  
##  Median :3.000     Mode  :character   Mode  :character  
##  Mean   :3.385                                          
##  3rd Qu.:4.000                                          
##  Max.   :5.000                                          
##  NA's   :1
# View(prof_df)

# Deal with missing data
# Numeric column: replace each NA with the mean of the observed values.
prof_df$Review_Difficulty[is.na(prof_df$Review_Difficulty)] <-
  mean(prof_df$Review_Difficulty, na.rm = TRUE)
# Text column: drop rows whose comment is missing or empty.
prof_df <- prof_df[!(is.na(prof_df$Review_Comment) | prof_df$Review_Comment == ""), ]

# Encode categorical values: "AWESOME" -> 1, "AWFUL" -> 0.
# NOTE(review): any other sentiment value silently becomes NA because it is
# not listed in `levels` -- confirm the csv only contains these two values.
prof_df$Review_Sentiment <- factor(prof_df$Review_Sentiment,
                                   levels = c("AWESOME", "AWFUL"),
                                   labels = c(1, 0))

# Prepare training and testing sets (70% / 30%).
# Fix the random seed so the split is the same on every run.
set.seed(123)
split <- sample.split(prof_df$Review_Quality, SplitRatio = 0.7)
train <- subset(prof_df, split)
test <- subset(prof_df, !split)

Case Study Using Natural Language Processing

Use the same csv file from previous section.
1. Apply data cleaning on “Review_Comment” variable
2. Use udpipe for lemmatization
3. Apply sentiment analysis
4. Count occurrences of each word across all comments
5. Create word cloud

comments <- prof_df$Review_Comment
head(comments, 1)
## [1] "Professor Blazej is the best ! He goes out of his way to help you. Going into this class I thought it was going to be really hard but I was wrong. The class was awesome!!! The material was fun to learn and Professor Blazej helped me out whenever I needed him. He was easy to get a hold of and he responded to all of my e-mails asap."
# Apply data cleaning on the "Review_Comment" variable.
# Order matters: links, html tags and @tags are recognised by their
# punctuation ("://", "<", ">", "@"), so they must be removed BEFORE digits
# and punctuation are stripped; otherwise those patterns can no longer match.
comments <- tolower(comments) # convert sentences to lower case
comments <- gsub("https?://[^[:space:]]+", " ", comments) # remove links (also at end of text)
comments <- gsub("<.*?>", " ", comments) # remove html tags
comments <- gsub("@[^[:space:]]+", " ", comments) # remove @tags (also at end of text)
comments <- gsub("[[:digit:]]+", " ", comments) # remove digits
# [[:punct:]] already covers , : ' ! and ", so no separate pass is needed
comments <- gsub("[[:punct:]]", " ", comments) # remove punctuation
head(comments, 1)
## [1] "professor blazej is the best   he goes out of his way to help you  going into this class i thought it was going to be really hard but i was wrong  the class was awesome    the material was fun to learn and professor blazej helped me out whenever i needed him  he was easy to get a hold of and he responded to all of my e mails asap "
# Lemmatize the cleaned comments with udpipe.
# `model` is the object returned by udpipe_download_model(); its file_model
# field is passed to udpipe_load_model() to load the model.
udmodel_english <- udpipe_load_model(model$file_model)
# Annotate the comments and flatten the annotation into a data frame
lemma_report <- udpipe_annotate(udmodel_english, x = comments) %>%
  as.data.frame()
# Paste the lemma column back together per doc_id / sentence_id group
lemma_storage <- paste.data.frame(
  lemma_report,
  term = "lemma",
  group = c("doc_id", "sentence_id")
)
lemmatized_comments <- lemma_storage[["lemma"]]
head(lemmatized_comments, 1)
## [1] "professor blazej be the best he go out of he way to help you go into this class I think it be go to be really hard but I be wrong the class be awesome the material be fun to learn and professor blazej help I out whenever I need he he be easy to get a hold of and he respond to all of my e mail asap"
# Apply sentiment analysis to the cleaned comments (one result row per comment).
# NOTE(review): sd is NA in the sample output below, presumably because the
# punctuation was stripped and each comment is scored as one sentence -- confirm.
sentiment_report <- sentiment_by(text.var = comments)
head(x = sentiment_report, n = 3)
##    element_id word_count sd ave_sentiment
## 1:          1         69 NA     0.3084587
## 2:          2         59 NA     0.7655108
## 3:          3         57 NA     1.0053201
# Count occurrences of each word across all lemmatized comments.
# One row per token: split the single text column into words.
word_df <- data.frame(lemmatized_comments = lemmatized_comments) %>%
  unnest_tokens(output = word, input = lemmatized_comments)
# Tally each distinct word, most frequent first (count column is `n`)
word_count_df <- count(word_df, word, sort = TRUE)

# Create word cloud from the word frequencies.
# NOTE(review): stop words (e.g. "be", "the") are not removed, so they will
# likely dominate the cloud -- confirm whether that is intended.
wordcloud(
  words = word_count_df$word,
  freq = word_count_df$n,
  min.freq = 3,
  max.words = 250,
  random.order = FALSE,
  rot.per = 0.30,
  colors = brewer.pal(8, "Dark2")
)