Assignment 1.1

Step 1

First, I create the 10 components for the list.

# Every value is drawn from Uniform(1, 100): one length-1 vector, one
# length-2 vector, then square matrices of side 3 through 10.
v1 <- runif(n = 1, min = 1, max = 100)
v2 <- runif(n = 2, min = 1, max = 100)
m3 <- matrix(runif(n = 9, min = 1, max = 100), nrow = 3, ncol = 3)
m4 <- matrix(runif(n = 16, min = 1, max = 100), nrow = 4, ncol = 4)
m5 <- matrix(runif(n = 25, min = 1, max = 100), nrow = 5, ncol = 5)
m6 <- matrix(runif(n = 36, min = 1, max = 100), nrow = 6, ncol = 6)
m7 <- matrix(runif(n = 49, min = 1, max = 100), nrow = 7, ncol = 7)
m8 <- matrix(runif(n = 64, min = 1, max = 100), nrow = 8, ncol = 8)
m9 <- matrix(runif(n = 81, min = 1, max = 100), nrow = 9, ncol = 9)
m10 <- matrix(runif(n = 100, min = 1, max = 100), nrow = 10, ncol = 10)

Then, I create the list using the 10 components with different dimensions.

# Bundle the ten components, smallest first, into a single list.
# NOTE(review): this variable shares its name with base::list(). Calls such
# as list(...) keep working because R skips non-function objects when it
# looks up a function, but a more specific name would be clearer. The later
# steps index `list[[i]]`, so the name is kept here.
list <- list(v1, v2, m3, m4, m5, m6, m7, m8, m9, m10)

Step 2

Values larger than 50 are selected from each component, giving 10 groups. For each group, if it contains more than one value, a value is calculated by dividing the standard deviation of the values by their mean. If it contains one value or none, 0 is returned. As a result, there will be 10 such values.

# For each of the ten components, print sd / mean of the entries above 50,
# or 0 when fewer than two entries qualify. capture.output() collects the
# printed lines as character strings (each still carries the "[1]" prefix).
finalvalue1 <- capture.output(
  for (idx in seq_len(10)) {
    component <- list[[idx]]
    above_50 <- component[component > 50]
    if (length(above_50) <= 1) {
      print(0)
    } else {
      print(sd(above_50) / mean(above_50))
    }
  }
)

Step 3

Use the function `cat` instead of `print` to remove the index prefix (e.g. `[1]`) from the printed output, and change the data type from string to number by using the function `as.numeric`.

# Same computation as Step 2, but written with cat() so that each captured
# line holds only the number (no "[1]" prefix) and can be parsed.
# NOTE(review): cat() writes at most 7 significant digits by default, so the
# parsed values are rounded copies of the computed ones.
finalvalue1 <- capture.output(
  for (idx in seq_len(10)) {
    component <- list[[idx]]
    above_50 <- component[component > 50]
    ratio <- if (length(above_50) > 1) {
      sd(above_50) / mean(above_50)
    } else {
      0
    }
    cat(ratio, "\n")
  }
)
# Turn the captured text lines back into numbers.
finalvalue1 <- as.numeric(finalvalue1)

Step 4

Calculate the sum of the 10 values.

# Add up the ten per-component values parsed in Step 3.
sum1 <- sum(finalvalue1)

Step 5

Repeat the process many times

# Run the whole experiment once and return the ten per-component values.
#
# sizes:     component sizes. Sizes 1 and 2 give plain vectors of that
#            length; larger sizes give a size-by-size matrix, as in Step 1.
# threshold: only entries strictly greater than this value are kept.
#
# Returns a numeric vector with one entry per size: sd / mean of the kept
# entries, or 0 when fewer than two entries are kept. Components are drawn in
# the same order as in Step 1, and the values stay numeric instead of being
# printed and parsed back, so no digits are lost.
simulate_cv_values <- function(sizes = 1:10, threshold = 50) {
  vapply(
    sizes,
    function(size) {
      component <- if (size <= 2) {
        runif(size, 1, 100)
      } else {
        matrix(runif(size * size, 1, 100), size, size)
      }
      kept <- component[component > threshold]
      if (length(kept) > 1) sd(kept) / mean(kept) else 0
    },
    numeric(1)
  )
}

# One run, stored under the same names as in the earlier steps. The
# components are built inside the function, so the global v1 ... m10 and
# `list` from Step 1 are left as they were.
finalvalue1 <- simulate_cv_values()
sum1 <- sum(finalvalue1)

# Actually repeat the process many times and average the resulting sums.
n_runs <- 1000
sum1_runs <- replicate(n_runs, sum(simulate_cv_values()))
mean(sum1_runs)

Findings: after running the code many times, I think the average value of sum1 is about 1.5.

Assignment 1.2

Step 1

Read the file into R

library(readr)
# Load the author list. The path is specific to the original author's
# machine.
authorlist <- read_csv(
  file = "~/Desktop/COMM6320_IntroductionR-master/authorlist.csv"
)
## Parsed with column specification:
## cols(
##   OID = col_integer(),
##   UID = col_integer(),
##   `Author Name` = col_character(),
##   Location = col_character(),
##   College = col_character(),
##   Discipline = col_character(),
##   `Discipline(department)` = col_character(),
##   Email = col_character()
## )

Step 2

Select the 3rd (Author Name) and 6th (Discipline) columns to form a new data frame audisc. After checking the basic features of the variable Discipline, I change the value NULL to NA. I also notice there are some typos in Discipline, and I will deal with the names after I split the words.

# Keep only the author-name and discipline columns (3rd and 6th in the file).
audisc <- authorlist[, c("Author Name", "Discipline")]
library(descr)
# Tabulate the raw discipline strings to spot placeholders and misspellings.
freq(audisc$Discipline)

## audisc$Discipline 
##                                                 Frequency  Percent
## Anthropology                                            9   1.7822
## Behavior                                                1   0.1980
## Biology                                                10   1.9802
## Biology,Epigenomics,Medicine                            1   0.1980
## Biology,Informatics                                     2   0.3960
## Biology,Medicine,Informatics                            5   0.9901
## Biology,Social Science                                  1   0.1980
## Biology,Statistics                                      4   0.7921
## Biology,Statistics, Epidemiology,Medicine               1   0.1980
## Biology,Statistics,Disease                              1   0.1980
## Biology,Zoology,Medicine                                1   0.1980
## Business                                                5   0.9901
## Communication                                           4   0.7921
## Computer Science                                       29   5.7426
## Computer Science,Communication                          1   0.1980
## Computer Science,Engineering                            3   0.5941
## Computer Science,Epidemiology                           1   0.1980
## Computer Science,Health                                 1   0.1980
## Computer Science,Medicine                               1   0.1980
## Computer science                                        2   0.3960
## Computer science,Engineering                            2   0.3960
## Computer,Math                                           1   0.1980
## Disease                                                30   5.9406
## Disease;Agriculture                                     1   0.1980
## Diseases                                                8   1.5842
## Economics                                               1   0.1980
## Economics,Public Health,Medicine                        1   0.1980
## Education                                               1   0.1980
## Engineering                                            48   9.5050
## Engineering,Computer Science                            7   1.3861
## Engineering,Health,Medicine                             1   0.1980
## Engineering,Medicine                                    1   0.1980
## Environment                                             4   0.7921
## Environment,Health                                      2   0.3960
## Epidemiology                                           25   4.9505
## Epidemiology,Biology,Statistics                         1   0.1980
## Epidemiology,Health                                     5   0.9901
## Epidemiology,Health,Social Science                      1   0.1980
## Geography                                               4   0.7921
## Health                                                 73  14.4554
## Health,Management                                       1   0.1980
## Health,Math                                             2   0.3960
## Health,Medicine                                         3   0.5941
## Health,Physics,Math,Computer                            1   0.1980
## Health,Statistics                                       1   0.1980
## Informatics                                            10   1.9802
## Informatics,Medicine                                    5   0.9901
## Informatics,Medicine,Epidemiology,Biostatistics         1   0.1980
## Journalism                                              1   0.1980
## Knowledge Structure                                     1   0.1980
## Knowledge Structure,Science                             1   0.1980
## Language                                                3   0.5941
## Language,Computer Science                               5   0.9901
## Management                                              9   1.7822
## Management,Medicine                                     1   0.1980
## Math                                                    5   0.9901
## Math,Computer,Epidemiology                              1   0.1980
## Math,Computer,Social Science                            1   0.1980
## Math,Statistics                                         3   0.5941
## Medicine                                               79  15.6436
## Medicine,Agriculture                                    2   0.3960
## Medicine,Computer Science                               1   0.1980
## Medicine,Disease                                        2   0.3960
## Medicine,Engineering                                    3   0.5941
## Medicine,Epidemiology,Biology,Statistics                1   0.1980
## Medicine,Health                                         8   1.5842
## Medicine,Informatics                                    3   0.5941
## Medicine,Social Sciences                                1   0.1980
## NULL                                                   23   4.5545
## Operations Research                                     1   0.1980
## Politics                                                1   0.1980
## Psychology                                              3   0.5941
## Psychology,Medicine                                     1   0.1980
## Science                                                 4   0.7921
## Sciences,Health                                         1   0.1980
## Social Science                                          3   0.5941
## Social Science,Anthropology                             1   0.1980
## Statistics                                              5   0.9901
## Statistics,Disease                                      3   0.5941
## Veterinary Medicine                                     1   0.1980
## informatics                                             2   0.3960
## informatics,Computer science                            1   0.1980
## Total                                                 505 100.0000
# The file marks a missing discipline with the literal text "NULL"; turn
# those entries into real missing values.
audisc$Discipline[audisc$Discipline %in% "NULL"] <- NA

Step 3

Replace “;” with “,”, and split the variable Discipline on “,”.

# Normalise the one alternative separator so that every entry is
# comma-delimited.
audisc$Discipline <- gsub(
  pattern = ";",
  replacement = ",",
  x = audisc$Discipline
)
# Split each entry into its individual discipline words.
Dis_split <- sapply(audisc$Discipline, strsplit, split = ",")

Step 4

I have to find out the maximum length of the values in the variable Dis_split.

# Number of disciplines listed for each author; the maximum tells how many
# positions have to be extracted.
leg <- lengths(Dis_split)
max(leg)
## [1] 4

Step 5

Extract the first word from the values

# Every entry has at least one element, so the first one can be taken
# directly.
Dis_1 <- sapply(Dis_split, `[[`, 1)

Step 6

Extract the second, third and fourth words from the values

# Return the element at `position`, or NA when the entry is too short.
pick_part <- function(parts, position) {
  if (length(parts) >= position) parts[[position]] else NA
}
Dis_2 <- sapply(Dis_split, pick_part, position = 2)
Dis_3 <- sapply(Dis_split, pick_part, position = 3)
Dis_4 <- sapply(Dis_split, pick_part, position = 4)

Step 7

Change the data structure of Dis_1, Dis_2, Dis_3 and Dis_4 to character

# Force each position vector to plain character. as.character() also drops
# any element names that sapply() attached in the previous steps, and it
# turns a position that came back as logical NA (no author has that many
# disciplines) into character NA.
Dis_1 <- as.character(Dis_1)
Dis_2 <- as.character(Dis_2)
Dis_3 <- as.character(Dis_3)
Dis_4 <- as.character(Dis_4)

Step 8

Recode similar disciplines into a single unified name, e.g., Computer science to Computer Science; Disease to Diseases.

library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
# Map every known spelling variant of a discipline to one canonical label.
#
# x: character vector of discipline names (NA allowed and passed through).
# Returns a character vector of the same length.
#
# The same table is applied at every position. Earlier, each position had
# its own partial table: "Social Science" was not recoded in first position,
# and "Science" and "informatics" were not recoded in second position. Both
# spellings of one discipline therefore survived into the merged matrix.
unify_discipline <- function(x) {
  recode(
    trimws(x), # removes stray padding such as " Epidemiology"
    "Computer science" = "Computer Science",
    "Computer" = "Computer Science",
    "Disease" = "Diseases",
    "Science" = "Sciences",
    "Social Science" = "Social Sciences",
    "informatics" = "Informatics"
  )
}
Dis_1 <- unify_discipline(Dis_1)
Dis_2 <- unify_discipline(Dis_2)
Dis_3 <- unify_discipline(Dis_3)
Dis_4 <- unify_discipline(Dis_4)

Step 9

Create data frames combining author names with disciplines to calculate the frequency.

# Pair every author with the discipline found at one position. data.frame()
# names the first column after the argument expression, which is why the
# cast() formulas in Step 10 refer to it as `audisc..Author.Name.`.
Disau1 <- data.frame(audisc$`Author Name`,Dis_1)
# Flag rows that have a discipline at this position with 1. The `freq`
# column does not exist yet, so this assignment creates it; rows that are
# not flagged are left as NA.
Disau1$freq[!is.na(Disau1$Dis_1)] <- 1
# Same construction for the second, third and fourth positions.
Disau2 <- data.frame(audisc$`Author Name`,Dis_2)
Disau2$freq[!is.na(Disau2$Dis_2)] <- 1
Disau3 <- data.frame(audisc$`Author Name`,Dis_3)
Disau3$freq[!is.na(Disau3$Dis_3)] <- 1
Disau4 <- data.frame(audisc$`Author Name`,Dis_4)
Disau4$freq[!is.na(Disau4$Dis_4)] <- 1

Step 10

Reshape the 4 data frames into matrices showing the relationship between author name and discipline.

library(reshape)
## 
## Attaching package: 'reshape'
## The following object is masked from 'package:dplyr':
## 
##     rename
# Spread each long table into a discipline-by-author table, one row per
# discipline and one column per author, summing `freq` within each cell.
Disau1 <- cast(
  data = Disau1,
  formula = Dis_1 ~ audisc..Author.Name.,
  fun.aggregate = sum
)
## Using freq as value column.  Use the value argument to cast to override this choice
Disau2 <- cast(
  data = Disau2,
  formula = Dis_2 ~ audisc..Author.Name.,
  fun.aggregate = sum
)
## Using freq as value column.  Use the value argument to cast to override this choice
Disau3 <- cast(
  data = Disau3,
  formula = Dis_3 ~ audisc..Author.Name.,
  fun.aggregate = sum
)
## Using freq as value column.  Use the value argument to cast to override this choice
Disau4 <- cast(
  data = Disau4,
  formula = Dis_4 ~ audisc..Author.Name.,
  fun.aggregate = sum
)
## Using freq as value column.  Use the value argument to cast to override this choice

Step 11

Remove all the missing values.

# Drop the row whose discipline label (first column) is missing.
Disau1 <- subset(Disau1, subset = !is.na(Disau1[, 1]))
Disau2 <- subset(Disau2, subset = !is.na(Disau2[, 1]))
Disau3 <- subset(Disau3, subset = !is.na(Disau3[, 1]))
Disau4 <- subset(Disau4, subset = !is.na(Disau4[, 1]))

Step 12

Rename the first column of each matrix for later merging.

# Give the label column the same name in all four tables so that they can be
# stacked and grouped on it.
names(Disau1)[1] <- "Discipline"
names(Disau2)[1] <- "Discipline"
names(Disau3)[1] <- "Discipline"
names(Disau4)[1] <- "Discipline"

Step 13

Merge the four data frames by rows.

# Stack the four positional tables, then add up the rows that share a
# discipline so that each discipline appears once.
stacked <- do.call(rbind, list(Disau1, Disau2, Disau3, Disau4))
Dis_matrix <- aggregate(. ~ Discipline, data = stacked, FUN = sum)

Step 14

Set the discipline names as row names.

# Move the discipline labels out of the first column and into the row names,
# leaving only the per-author count columns.
discipline_labels <- Dis_matrix[[1]]
Dis_matrix <- Dis_matrix[, -1]
rownames(Dis_matrix) <- discipline_labels

Step 15

Convert the data to a numeric matrix and create the final co-occurrence matrix.

# Convert to a numeric matrix (disciplines in rows, authors in columns) and
# treat any remaining missing cell as zero.
Dis_matrix <- data.matrix(Dis_matrix)
Dis_matrix <- replace(Dis_matrix, is.na(Dis_matrix), 0)
# Multiplying the matrix by its transpose gives a discipline-by-discipline
# table: entry [i, j] sums, over all authors, the product of that author's
# counts for disciplines i and j.
Dis_matrix_tran <- t(Dis_matrix)
Dis_occ <- Dis_matrix %*% Dis_matrix_tran