First, I create the 10 components for the list.
# Build the ten components: a single value, a length-2 vector, and square
# matrices of side 3 through 10. Every entry is drawn from Uniform(1, 100).
v1 <- runif(n = 1, min = 1, max = 100)
v2 <- c(runif(n = 2, min = 1, max = 100))
m3 <- matrix(runif(n = 3 * 3, min = 1, max = 100), nrow = 3, ncol = 3)
m4 <- matrix(runif(n = 4 * 4, min = 1, max = 100), nrow = 4, ncol = 4)
m5 <- matrix(runif(n = 5 * 5, min = 1, max = 100), nrow = 5, ncol = 5)
m6 <- matrix(runif(n = 6 * 6, min = 1, max = 100), nrow = 6, ncol = 6)
m7 <- matrix(runif(n = 7 * 7, min = 1, max = 100), nrow = 7, ncol = 7)
m8 <- matrix(runif(n = 8 * 8, min = 1, max = 100), nrow = 8, ncol = 8)
m9 <- matrix(runif(n = 9 * 9, min = 1, max = 100), nrow = 9, ncol = 9)
m10 <- matrix(runif(n = 10 * 10, min = 1, max = 100), nrow = 10, ncol = 10)
Then, I create the list using the 10 components with different dimensions.
# Bundle the ten components, smallest first, into a single list.
# NOTE: the variable is called `list`, the same name as the base constructor.
# Calls written as list(...) still reach the function, because R skips
# non-function bindings when looking up a function to call.
list <- list(v1,v2,m3,m4,m5,m6,m7,m8,m9,m10)
Values larger than 50 are selected from each component, giving 10 groups. For each group, if it contains more than one value, the standard deviation of the values is divided by their mean. If it contains one value or none, 0 is returned. As a result, there will be 10 such values.
# Collect, as printed text, the sd/mean ratio of the values above 50 in each
# of the ten components; a component with at most one such value yields 0.
finalvalue1 <- capture.output(
  for (i in seq_len(10)) {
    N <- list[[i]][list[[i]] > 50]
    print(if (length(N) <= 1) 0 else sd(N) / mean(N))
  }
)
Use the function `cat` instead of `print` so that the captured output no longer carries the `[1]` prefix, and convert the captured strings to numbers with the function `as.numeric`.
# Same computation as before, but written with cat() so each captured line
# holds only the number, which as.numeric() can then parse.
finalvalue1 <- as.numeric(
  capture.output(
    for (i in seq_len(10)) {
      N <- list[[i]][list[[i]] > 50]
      cat(if (length(N) <= 1) 0 else sd(N) / mean(N), "\n")
    }
  )
)
Calculate the sum of the 10 values.
# Total of the ten per-component ratios held in finalvalue1.
sum1 <- sum(finalvalue1)
Repeat the process many times
# Repeat the whole procedure: draw fresh components, compute the sd/mean
# ratio of the values above 50 in each one, and sum the ten ratios.

# Draw the ten components (single value, length-2 vector, 3x3 ... 10x10
# matrices), all entries from Uniform(1, 100), smallest component first.
make_components <- function() {
  c(
    list(runif(1, 1, 100), runif(2, 1, 100)),
    lapply(3:10, function(k) matrix(runif(k * k, 1, 100), k, k))
  )
}

# sd/mean of the entries of `x` above `cutoff`; 0 when at most one entry
# qualifies, because sd() needs at least two values.
ratio_above <- function(x, cutoff = 50) {
  kept <- x[x > cutoff]
  if (length(kept) > 1) sd(kept) / mean(kept) else 0
}

# One fresh run, stored under the same variable names as before.
list <- make_components()
v1 <- list[[1]]
v2 <- list[[2]]
m3 <- list[[3]]
m4 <- list[[4]]
m5 <- list[[5]]
m6 <- list[[6]]
m7 <- list[[7]]
m8 <- list[[8]]
m9 <- list[[9]]
m10 <- list[[10]]

# The ratios are computed numerically instead of being parsed back from
# printed text, so they keep full double precision (printed output is
# limited to 7 significant digits by default).
finalvalue1 <- vapply(list, ratio_above, numeric(1))
sum1 <- sum(finalvalue1)

# Actually repeat the process many times and average the sums, rather than
# re-running the chunk by hand and estimating the average by eye.
n_runs <- 1000
sum1_runs <- replicate(
  n_runs,
  sum(vapply(make_components(), ratio_above, numeric(1)))
)
mean_sum1 <- mean(sum1_runs)
mean_sum1
Findings: after running the code many times, I estimate that the average value of sum1 is about 1.5.
Read the file into R
library(readr)
# Load the author list from the CSV file.
# NOTE(review): the path points into one user's Desktop folder; adjust it
# before running on another machine.
authorlist <- read_csv(
  file = "~/Desktop/COMM6320_IntroductionR-master/authorlist.csv"
)
## Parsed with column specification:
## cols(
## OID = col_integer(),
## UID = col_integer(),
## `Author Name` = col_character(),
## Location = col_character(),
## College = col_character(),
## Discipline = col_character(),
## `Discipline(department)` = col_character(),
## Email = col_character()
## )
Select the 3rd (Author Name) and 6th (Discipline) columns to form a new data frame audisc. After checking the basic features of the variable Discipline, I change the value NULL to NA. I also notice that there are some typos in Discipline; I will deal with those names after I split the words.
# Keep only columns 3 and 6 of authorlist (selected by position).
audisc <- authorlist[,c(3,6)]
library(descr)
# Tabulate how often each raw Discipline value occurs (counts and percents).
freq(audisc$Discipline)
## audisc$Discipline
## Frequency Percent
## Anthropology 9 1.7822
## Behavior 1 0.1980
## Biology 10 1.9802
## Biology,Epigenomics,Medicine 1 0.1980
## Biology,Informatics 2 0.3960
## Biology,Medicine,Informatics 5 0.9901
## Biology,Social Science 1 0.1980
## Biology,Statistics 4 0.7921
## Biology,Statistics, Epidemiology,Medicine 1 0.1980
## Biology,Statistics,Disease 1 0.1980
## Biology,Zoology,Medicine 1 0.1980
## Business 5 0.9901
## Communication 4 0.7921
## Computer Science 29 5.7426
## Computer Science,Communication 1 0.1980
## Computer Science,Engineering 3 0.5941
## Computer Science,Epidemiology 1 0.1980
## Computer Science,Health 1 0.1980
## Computer Science,Medicine 1 0.1980
## Computer science 2 0.3960
## Computer science,Engineering 2 0.3960
## Computer,Math 1 0.1980
## Disease 30 5.9406
## Disease;Agriculture 1 0.1980
## Diseases 8 1.5842
## Economics 1 0.1980
## Economics,Public Health,Medicine 1 0.1980
## Education 1 0.1980
## Engineering 48 9.5050
## Engineering,Computer Science 7 1.3861
## Engineering,Health,Medicine 1 0.1980
## Engineering,Medicine 1 0.1980
## Environment 4 0.7921
## Environment,Health 2 0.3960
## Epidemiology 25 4.9505
## Epidemiology,Biology,Statistics 1 0.1980
## Epidemiology,Health 5 0.9901
## Epidemiology,Health,Social Science 1 0.1980
## Geography 4 0.7921
## Health 73 14.4554
## Health,Management 1 0.1980
## Health,Math 2 0.3960
## Health,Medicine 3 0.5941
## Health,Physics,Math,Computer 1 0.1980
## Health,Statistics 1 0.1980
## Informatics 10 1.9802
## Informatics,Medicine 5 0.9901
## Informatics,Medicine,Epidemiology,Biostatistics 1 0.1980
## Journalism 1 0.1980
## Knowledge Structure 1 0.1980
## Knowledge Structure,Science 1 0.1980
## Language 3 0.5941
## Language,Computer Science 5 0.9901
## Management 9 1.7822
## Management,Medicine 1 0.1980
## Math 5 0.9901
## Math,Computer,Epidemiology 1 0.1980
## Math,Computer,Social Science 1 0.1980
## Math,Statistics 3 0.5941
## Medicine 79 15.6436
## Medicine,Agriculture 2 0.3960
## Medicine,Computer Science 1 0.1980
## Medicine,Disease 2 0.3960
## Medicine,Engineering 3 0.5941
## Medicine,Epidemiology,Biology,Statistics 1 0.1980
## Medicine,Health 8 1.5842
## Medicine,Informatics 3 0.5941
## Medicine,Social Sciences 1 0.1980
## NULL 23 4.5545
## Operations Research 1 0.1980
## Politics 1 0.1980
## Psychology 3 0.5941
## Psychology,Medicine 1 0.1980
## Science 4 0.7921
## Sciences,Health 1 0.1980
## Social Science 3 0.5941
## Social Science,Anthropology 1 0.1980
## Statistics 5 0.9901
## Statistics,Disease 3 0.5941
## Veterinary Medicine 1 0.1980
## informatics 2 0.3960
## informatics,Computer science 1 0.1980
## Total 505 100.0000
# Treat the literal text "NULL" as a missing value.
audisc$Discipline[audisc$Discipline %in% "NULL"] <- NA
Replace “;” with “,”, and split the variable Discipline on “,”.
# Use the comma as the only separator, then break every Discipline value
# into its individual entries (one character vector per author).
audisc$Discipline <- gsub(";", ",", audisc$Discipline, fixed = TRUE)
Dis_split <- sapply(audisc$Discipline, strsplit, split = ",")
I have to find out the maximum length of the values in the variable Dis_split.
# Number of entries per author, and the largest such count; the maximum
# tells how many positions have to be extracted below.
leg <- sapply(Dis_split, length)
max(leg)
## [1] 4
Extract the first word from the values
# First entry of every author's split Discipline value.
Dis_1 <- sapply(Dis_split, `[[`, 1)
Extract the second, third and fourth words from the values
# Second, third and fourth entries; NA when an author has fewer entries.
Dis_2 <- sapply(Dis_split, function(x) if (length(x) > 1) x[[2]] else NA)
Dis_3 <- sapply(Dis_split, function(x) if (length(x) > 2) x[[3]] else NA)
Dis_4 <- sapply(Dis_split, function(x) if (length(x) > 3) x[[4]] else NA)
Change the data structure of Dis_1, Dis_2, Dis_3 and Dis_4 to character
# Convert each position vector to a plain character vector; as.character()
# also discards any names attribute carried over from the sapply() calls.
Dis_1 <- as.character(Dis_1)
Dis_2 <- as.character(Dis_2)
Dis_3 <- as.character(Dis_3)
Dis_4 <- as.character(Dis_4)
Recode similar disciplines into a single unified name, e.g., Computer science to Computer Science; Disease to Diseases.
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
# Harmonise spelling variants so the same discipline always gets the same
# label, whichever position it occupied in the comma-separated value.
# A single shared lookup is applied to all four vectors: with a separate
# list per position, a leading "Social Science" stayed singular while a
# later one became "Social Sciences" (and "Science" was only fixed in the
# first position), which splits one discipline into two categories.
discipline_fixes <- c(
  "Computer science" = "Computer Science",
  "Computer"         = "Computer Science",
  "Disease"          = "Diseases",
  "Science"          = "Sciences",
  "Social Science"   = "Social Sciences",
  "informatics"      = "Informatics"
)

# Trim stray blanks (e.g. " Epidemiology") and map known variants to their
# canonical label; NA and unlisted labels pass through unchanged.
normalise_discipline <- function(x) {
  x <- trimws(x)
  known <- x %in% names(discipline_fixes)
  x[known] <- unname(discipline_fixes[x[known]])
  x
}

Dis_1 <- normalise_discipline(Dis_1)
Dis_2 <- normalise_discipline(Dis_2)
Dis_3 <- normalise_discipline(Dis_3)
Dis_4 <- normalise_discipline(Dis_4)
Create data frames combining author names with disciplines to calculate the frequency.
# Pair every author with their k-th discipline and flag the pairs that
# exist: freq is 1 where a discipline is present and NA where it is missing.
Disau1 <- data.frame(audisc$`Author Name`, Dis_1)
Disau1$freq <- NA_real_
Disau1$freq[!is.na(Disau1$Dis_1)] <- 1

Disau2 <- data.frame(audisc$`Author Name`, Dis_2)
Disau2$freq <- NA_real_
Disau2$freq[!is.na(Disau2$Dis_2)] <- 1

Disau3 <- data.frame(audisc$`Author Name`, Dis_3)
Disau3$freq <- NA_real_
Disau3$freq[!is.na(Disau3$Dis_3)] <- 1

Disau4 <- data.frame(audisc$`Author Name`, Dis_4)
Disau4$freq <- NA_real_
Disau4$freq[!is.na(Disau4$Dis_4)] <- 1
Reshape the 4 data frames into matrices showing the relationship between author name and discipline.
library(reshape)
##
## Attaching package: 'reshape'
## The following object is masked from 'package:dplyr':
##
## rename
# Pivot each long author/discipline table into a discipline-by-author grid,
# summing the freq flags that fall into the same cell.
Disau1 <- cast(
  Disau1,
  formula = Dis_1 ~ audisc..Author.Name.,
  fun.aggregate = sum
)
## Using freq as value column. Use the value argument to cast to override this choice
Disau2 <- cast(
  Disau2,
  formula = Dis_2 ~ audisc..Author.Name.,
  fun.aggregate = sum
)
## Using freq as value column. Use the value argument to cast to override this choice
Disau3 <- cast(
  Disau3,
  formula = Dis_3 ~ audisc..Author.Name.,
  fun.aggregate = sum
)
## Using freq as value column. Use the value argument to cast to override this choice
Disau4 <- cast(
  Disau4,
  formula = Dis_4 ~ audisc..Author.Name.,
  fun.aggregate = sum
)
## Using freq as value column. Use the value argument to cast to override this choice
Remove all the missing values.
# Drop the row whose discipline label (first column) is missing.
Disau1 <- subset(Disau1, subset = !is.na(Disau1[, 1]))
Disau2 <- subset(Disau2, subset = !is.na(Disau2[, 1]))
Disau3 <- subset(Disau3, subset = !is.na(Disau3[, 1]))
Disau4 <- subset(Disau4, subset = !is.na(Disau4[, 1]))
Rename the first column of each matrix for later merging.
# Give the label column the same name in all four tables so that they can
# be stacked and grouped on it.
names(Disau1)[1L] <- "Discipline"
names(Disau2)[1L] <- "Discipline"
names(Disau3)[1L] <- "Discipline"
names(Disau4)[1L] <- "Discipline"
Merge the four data frames by rows.
# Stack the four tables and add up the rows that share a discipline, so
# that every discipline ends up on exactly one row.
Dis_matrix <- aggregate(
  . ~ Discipline,
  data = rbind(Disau1, Disau2, Disau3, Disau4),
  FUN = sum
)
Set the discipline names as row names.
# Move the discipline labels from the first column into the row names, then
# drop that column so only the per-author counts remain.
row.names(Dis_matrix) <- Dis_matrix[[1]]
Dis_matrix <- Dis_matrix[, -1]
Create the final co-occurrence matrix after converting the data to a numeric matrix.
# Turn the table into a numeric discipline-by-author matrix and treat
# missing cells as zero counts.
Dis_matrix <- data.matrix(Dis_matrix)
Dis_matrix[is.na(Dis_matrix)] <- 0

# Multiplying the matrix by its transpose gives a discipline-by-discipline
# matrix: each cell sums, over authors, the product of the two counts.
Dis_matrix_tran <- t(Dis_matrix)
Dis_occ <- Dis_matrix %*% Dis_matrix_tran