This cheatsheet is designed for the course COMS4995 (Empirical Methods of Data Science).
Installation method: install.packages("package_name")
library(ggplot2) #create graphics
library(dplyr) #data manipulation using %>%
library(caTools) #machine learning
library(udpipe) #lemmatization
#Download the English udpipe model used later for lemmatization.
#overwrite = FALSE reuses a copy that is already on disk instead of
#downloading the model again on every run.
model <- udpipe_download_model(language = "english", overwrite = FALSE)
library(sentimentr) #sentiment analysis
library(wordcloud) #word cloud visualization
library(tidytext) #text mining
#Build matrix by "rbind"
#Mixing a character row with numeric rows coerces every cell to character,
#which is why the scores are printed in quotes below.
row1 <- c("Empirical", "Methods", "Data", "Science")
row2 <- sample(90:100, 4, replace = FALSE)
row3 <- sample(90:100, 4, replace = FALSE)
matrix <- rbind(row1, row2, row3)
matrix
## [,1] [,2] [,3] [,4]
## row1 "Empirical" "Methods" "Data" "Science"
## row2 "98" "96" "91" "100"
## row3 "92" "100" "95" "94"
#Set column names and row names
colnames(matrix) <- c("c1", "c2", "c3", "c4")
rownames(matrix) <- c("r1", "r2", "r3")
matrix
## c1 c2 c3 c4
## r1 "Empirical" "Methods" "Data" "Science"
## r2 "98" "96" "91" "100"
## r3 "92" "100" "95" "94"
#Extract an element from matrix (by name, then by position)
matrix["r1", "c2"]
## [1] "Methods"
matrix[1, 2]
## [1] "Methods"
#Delete specific row from matrix (drop = FALSE keeps the result a matrix)
matrix <- matrix[-1, , drop = FALSE]
#The remaining cells are still character strings such as "98"; convert them
#to numbers explicitly instead of relying on implicit coercion when plotting.
#Dimensions and row/column names are kept.
mode(matrix) <- "numeric"
#Visualize matrix by matplot
matplot(matrix, type = "l")
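Take the built-in “mtcars” dataset as an example.
1. Select and exclude specific variables from a dataframe: select(df, ...)
2. Filter by condition: filter(df, condition)
3. Sort rows in descending or ascending order: arrange(df, ...)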
df <- mtcars
#Keep only the listed columns
df_2 <- df %>% select(mpg, disp)
head(df_2, 3)
## mpg disp
## Mazda RX4 21.0 160
## Mazda RX4 Wag 21.0 160
## Datsun 710 22.8 108
#Drop a column by prefixing its name with a minus sign
df_3 <- df_2 %>% select(-mpg)
head(df_3, 3)
## disp
## Mazda RX4 160
## Mazda RX4 Wag 160
## Datsun 710 108
#Keep the rows that satisfy every condition
#(comma-separated conditions in filter() are combined with AND)
df_4 <- df_2 %>% filter(mpg > 30, disp > 75)
head(df_4, 3)
## mpg disp
## Fiat 128 32.4 78.7
## Honda Civic 30.4 75.7
## Lotus Europa 30.4 95.1
#Sort by mpg in descending order, breaking ties by disp in ascending order
df_5 <- arrange(df_4, desc(mpg), disp)
head(df_5, 3)
## mpg disp
## Fiat 128 32.4 78.7
## Honda Civic 30.4 75.7
## Lotus Europa 30.4 95.1
#Histogram of mpg using 30 bins
ggplot(data = df, mapping = aes(x = mpg)) +
  geom_histogram(bins = 30) +
  labs(x = "MPG", y = "Count of MPG")
#Box Plot of mpg for each gear value (gear is treated as a category)
ggplot(data = df, mapping = aes(x = factor(gear), y = mpg)) +
  geom_boxplot() +
  labs(x = "Gear", y = "MPG")
#Violin Plot of mpg for each gear value (gear is treated as a category)
ggplot(data = df, mapping = aes(x = factor(gear), y = mpg)) +
  geom_violin() +
  labs(x = "Gear", y = "MPG")
The given csv file contains students’ evaluations of two
distinct professors. The data were collected from
“ratemyprofessors.com”.
1. Read csv file
2. Handle missing data: replace missing numerical values with the column average,
and remove rows whose text values are missing or empty.
3. Encode categorical values
4. Prepare for training and testing sets
#Load the reviews from the csv file into a data frame. View(prof_df) shows the original dataset.
prof_df <- read.csv("Example.csv")
summary(prof_df)
## Institution Department Review_Comment Review_Quality
## Length:27 Length:27 Length:27 Min. :1.000
## Class :character Class :character Class :character 1st Qu.:2.000
## Mode :character Mode :character Mode :character Median :5.000
## Mean :3.889
## 3rd Qu.:5.000
## Max. :5.000
##
## Review_Difficulty Review_Sentiment Review_Date
## Min. :2.000 Length:27 Length:27
## 1st Qu.:3.000 Class :character Class :character
## Median :3.000 Mode :character Mode :character
## Mean :3.385
## 3rd Qu.:4.000
## Max. :5.000
## NA's :1
#View(prof_df)
#Missing numerical values: fill each NA difficulty score with the mean of the observed scores
prof_df$Review_Difficulty <- ifelse(
  is.na(prof_df$Review_Difficulty),
  mean(prof_df$Review_Difficulty, na.rm = TRUE),
  prof_df$Review_Difficulty
)
#Missing text values: keep only the rows whose comment is neither NA nor empty
prof_df <- prof_df[!is.na(prof_df$Review_Comment) & prof_df$Review_Comment != "", ]
#Encode the sentiment text as a factor: "AWESOME" is labelled 1 and "AWFUL" is labelled 0
#(values outside these two levels become NA)
prof_df$Review_Sentiment <- factor(
  prof_df$Review_Sentiment,
  levels = c("AWESOME", "AWFUL"),
  labels = c(1, 0)
)
#Split into training and testing sets: rows flagged TRUE go to train, the rest to test
split <- sample.split(prof_df$Review_Quality, SplitRatio = 0.7)
train <- subset(prof_df, split)
test <- subset(prof_df, !split)
Use the same csv file from the previous section.
1. Apply data cleaning on “Review_Comment” variable
2. Use udpipe for lemmatization
3. Apply sentiment analysis
4. Count occurrences of words in each sentence
5. Create word cloud
comments <- prof_df$Review_Comment
head(comments, 1)
## [1] "Professor Blazej is the best ! He goes out of his way to help you. Going into this class I thought it was going to be really hard but I was wrong. The class was awesome!!! The material was fun to learn and Professor Blazej helped me out whenever I needed him. He was easy to get a hold of and he responded to all of my e-mails asap."
#Apply data cleaning on "Review_Comment" variable
#Order matters: links, html tags and @tags are removed first, while the
#punctuation that delimits them (:, /, <, >, @) is still in the text.
comments <- tolower(comments) #convert sentences to lower case
comments <- gsub("https?://\\S+", " ", comments) #remove link (also at end of text)
comments <- gsub("<.*?>", " ", comments) #remove html tags
comments <- gsub("@\\S+", " ", comments) #remove tags (also at end of text)
comments <- gsub("[[:digit:]]+", " ", comments) #remove digits
#A character class removes any one of the listed characters; this step is
#already covered by the [[:punct:]] step below and is kept to show how to
#target selected punctuation only.
comments <- gsub("[,:'!\"]", " ", comments) #remove selected punctuation
comments <- gsub("[[:punct:]]", " ", comments) #remove punctuation
head(comments, 1)
## [1] "professor blazej is the best he goes out of his way to help you going into this class i thought it was going to be really hard but i was wrong the class was awesome the material was fun to learn and professor blazej helped me out whenever i needed him he was easy to get a hold of and he responded to all of my e mails asap "
#Lemmatize the cleaned comments with the downloaded udpipe model
udmodel_english <- udpipe_load_model(model[["file_model"]])
#Annotate the comments and turn the annotation into a data frame
lemma_report <- udpipe_annotate(udmodel_english, x = comments) %>%
  as.data.frame()
#Paste the lemmas back together, one string per doc_id/sentence_id group
lemma_storage <- paste.data.frame(
  lemma_report,
  term = "lemma",
  group = c("doc_id", "sentence_id")
)
lemmatized_comments <- lemma_storage[["lemma"]]
head(lemmatized_comments, 1)
## [1] "professor blazej be the best he go out of he way to help you go into this class I think it be go to be really hard but I be wrong the class be awesome the material be fun to learn and professor blazej help I out whenever I need he he be easy to get a hold of and he respond to all of my e mail asap"
#Score the sentiment of each cleaned comment (one row per comment, see ave_sentiment)
sentiment_report <- sentiment_by(text.var = comments)
head(sentiment_report, n = 3)
## element_id word_count sd ave_sentiment
## 1: 1 69 NA 0.3084587
## 2: 2 59 NA 0.7655108
## 3: 3 57 NA 1.0053201
#Split the lemmatized comments into one word per row
word_df <- unnest_tokens(
  select(as.data.frame(lemmatized_comments), "lemmatized_comments"),
  word,
  "lemmatized_comments"
)
#Count how often each word occurs, most frequent first
word_count_df <- count(word_df, word, sort = TRUE)
#Draw the word cloud from the word frequencies
wordcloud(
  words = word_count_df$word,
  freq = word_count_df$n,
  min.freq = 3,
  max.words = 250,
  random.order = FALSE,
  rot.per = 0.30,
  colors = brewer.pal(8, "Dark2")
)