# Here is the code we will be working with. You'll also need the TWEETS.csv data file.
########## AEJMC 2023 Southeast Colloquium ##################
########## Middle Tennessee State University ################
######### College of Media and Entertainment ################
######## School of Journalism & Strategic Media #############
########### Dive headfirst into R ###########################
######## Prepared by Ken Blake and Jun Zhang ################
#use "#" to enter a comment
########### Simple Math ##########
# R works as a calculator: type an expression and run it to see the result.
3 + 3
50 * 125
1:10 # the colon ":" returns every integer between two integers
##### Objects & Vectors ##########
a <- 20 # the Environment pane lists every object you create
A <- 20 # R is case sensitive: "A" and "a" are two different objects
a <- 6 / (4 - 1) # assigning again replaces the previous value (a is now 2)
a # run an object's name to print its value
a + 5
b <- "R workshop" # text (character) values go inside quotes
b
c <- 1:6 # the integers 1 through 6
d <- c(7, 1.5, 1.5, 6, 0.5, 3) # c() combines values into one vector
?c() # use "?" to open the help page of a particular function
e <- c(
  "Tennessee", "Alabama", "Kentucky", "Georgia", "Tennessee", "North Carolina"
)
d + 1 # arithmetic is applied to every element of the vector
d * c # element-by-element multiplication (both vectors have 6 elements)
# %o% is the OUTER product: a 6 x 6 matrix holding every d[i] * c[j].
# Traditional matrix multiplication is the separate operator %*%.
d %o% c
########### Comparison ##########
# Every comparison answers with TRUE/FALSE; on a vector you get one
# TRUE/FALSE per element.
a > 10 # is a greater than 10?
d == 7 # which elements of d are equal to 7?
c != 10 # which elements of c are NOT equal to 10?
c <= d # element by element: is c less than or equal to d?
a == 3 | d == 7 # logical OR: TRUE where at least one side is TRUE
a == 2 & e == "Tennessee" # logical AND: TRUE only where both sides are TRUE
######### Conditionals ##########
# if/else runs the first branch when the condition is TRUE and the
# else branch in every other case.
if (a > 5) {
  print("a is greater than 5")
} else {
  # This branch also runs when a is exactly 5, so the message has to
  # cover "equal to" as well as "less than".
  print("a is less than or equal to 5")
}
########### Packages ############
# A package only needs to be installed once per computer. The checks below
# call install.packages() only when the package is missing, so re-running
# the script does not download and reinstall everything each time.
if (!requireNamespace("tidyverse", quietly = TRUE)) {
  install.packages("tidyverse") # install the package
}
if (!requireNamespace("lubridate", quietly = TRUE)) {
  install.packages("lubridate")
}
library(tidyverse) # load the package (needed in every new R session)
library(datasets) # example datasets that ship with R
help(package = "datasets") # list everything the package contains
?datasets # open the package's help page
########### Functions ###########
# "trees" is one of the example data frames from the datasets package.
head(trees) # show the first six rows of the dataset
summary(trees) # summary statistics for every column
trees # print the whole data frame
class(trees) # check the type of the object
class(trees$Girth) # check the type of a single column
sort(trees$Girth) # sort from smallest to largest
unique(trees$Girth) # see the unique values
names(trees) # get the names of the variables (columns) in the dataset
ls() # list the objects that currently exist in your workspace
######## Basic Statistics ########
# Numeric data: each call summarises the Girth column of "trees".
# Always spell the column name out in full: "$" on a data frame quietly
# accepts a shortened name, which hides typos.
mean(trees$Girth) # average
sd(trees$Girth) # standard deviation
sum(trees$Girth) # total
min(trees$Girth) # smallest value
max(trees$Girth) # largest value
length(trees$Girth) # check the length of a vector
# Categorical data
summary(e) # for a text vector this only reports its length, class and mode
table(e) # count how many times each category occurs
length(e) # number of elements
unique(e) # the distinct categories
# Clear the workspace
rm(A) # remove one object by name
rm(list = ls()) # remove every object in the current workspace
########### Exercise ##############
# Task 1: create an R object called ID and assign it the values 1 to 7.
# Task 2: create an R object called location and assign it the values
#   Tennessee, Alabama, Kentucky, Georgia, Tennessee, North Carolina, Tennessee.
# Task 3: create an R object called followers and assign it the values
#   2525, 908, 1560, 2687, 525, 1237, 667.
# One possible solution:
ID <- seq_len(7)
location <- c(
  "Tennessee", "Alabama", "Kentucky", "Georgia",
  "Tennessee", "North Carolina", "Tennessee"
)
followers <- c(
  2525, 908, 1560, 2687,
  525, 1237, 667
)
########### Data frame ############
# Combine the three vectors into one table. The names on the left of each
# "=" ("id", "location", "followers") become the column headers.
account <- data.frame(
  id = ID,
  location = location,
  followers = followers
)
account
summary(account)
# Specify elements within a data object with [row, column]
account[1, ] # display the 1st row of the data frame
account[, 2] # display the 2nd column of the data frame
# Take a subset of the data: a data frame that contains only the accounts
# located in Tennessee
account_TN <- account[account$location == "Tennessee", ]
# Aggregate data: the mean number of followers for each location
aggregate(x = account$followers, by = list(account$location), FUN = mean)
# Replace values: swap "Tennessee" for "TN" in the location column
account$location <- sub(
  pattern = "Tennessee",
  replacement = "TN",
  x = account$location
)
##### Load & Export Data #######
# Working directory
getwd() # show the folder R currently reads from and writes to
# setwd() must be given the folder path; calling it with nothing inside the
# parentheses is an error. Remove the "#" from the next line and replace the
# path with the folder that holds TWEETS.csv:
# setwd("path/to/your/folder")
# Load data
TWEETS <- read.csv("TWEETS.csv", header = TRUE, sep = ",")
# If your file is not in the working directory, you need to specify the
# full file path instead of just the file name.
# Take a random sample of up to 1,000 tweets from the original dataset.
# sample.int() draws row numbers between 1 and nrow(TWEETS); capping the
# size at nrow(TWEETS) keeps the call from failing when the file has fewer
# than 1,000 rows.
# "replace = FALSE": each row can only be selected once.
TWEETS_sample <- TWEETS[
  sample.int(nrow(TWEETS), size = min(1000, nrow(TWEETS)), replace = FALSE),
]
# Export data. By default write.csv() also writes the row names as the
# first column of the file.
write.csv(TWEETS_sample, "TWEETS_sample.csv")
# Inspecting data & basic statistics
names(TWEETS_sample) # Return the header (column) names
# Return the first part of an object. You can specify the number of rows,
# e.g., head(TWEETS_sample, 10). Otherwise, it will return the first six rows.
head(TWEETS_sample)
tail(TWEETS_sample) # Return the last part of an object
summary(TWEETS_sample) # Summarise every column
nrow(TWEETS_sample) # Return the total number of rows
ncol(TWEETS_sample) # Return the total number of columns
unique(TWEETS_sample$Author) # Display the unique values of a column
length(unique(TWEETS_sample$Author)) # Count the number of unique values