Text Analysis:Bibliometrix

From PRGs Wiki
Jump to navigation Jump to search

This page collects various tools for processing bibliometric data. The tools were originally developed for a project that looked into academic communities by exploring search results from Web of Knowledge, at the time specifically on Complex Thinking. Most of the tools are built with Neo4j and R. The programs build a network of papers and the works they cite.

The first useful piece of code takes data exported from Web of Science, loads it with the R bibliometrix package, and writes it out as a CSV file that can then be loaded into Neo4j.

library(bibliometrix)
library(igraph)
library(ggplot2)

# Read the BibTeX export downloaded from Web of Science.
# NOTE(review): readFiles() may no longer exist in recent bibliometrix
# releases, where convert2df() is given the file path directly -- confirm
# against the installed version before running.
D <- readFiles("bib_export.bib")
# Convert the raw export into a bibliometrix data frame.
M <- convert2df(D, dbsource = "isi", format = "bibtex")

# Pull out the fields we want; more columns could be added here.
cited <- M$CR   # cited references, all references of a paper in one string
author <- M$AU
key <- M$SR     # used below as the identifier of the citing paper
title <- M$TI
data <- cbind(key, title, author, cited)

# Build the edge list for the export: column 1 is the citing paper's key,
# column 2 is a single cited reference.
#
# The references of one paper are separated by a literal full stop followed by
# three spaces, so the split must be `fixed = TRUE`; as a regular expression
# the "." would match any character and also cut references at every run of
# three spaces.
cit_lists <- strsplit(as.character(cited), split = ".   ", fixed = TRUE)
# Papers without cited references yield NA (or nothing at all); drop those
# entries so that no bogus "NA" edge ends up in the export.
cit_lists <- lapply(
  cit_lists,
  function(refs) refs[!is.na(refs) & nzchar(refs)]
)
# Assemble the whole matrix in one step instead of growing it with rbind()
# inside a loop. Built with matrix() so it has no column names, which keeps the
# header written by write.csv2() the same as before ("V1";"V2"), and so that an
# input without any reference still gives a valid 0 x 2 matrix.
E_List <- matrix(
  c(rep(as.character(key), lengths(cit_lists)), unlist(cit_lists)),
  ncol = 2
)

# Save the data (semicolon-separated, as expected by the Neo4j import).
write.csv2(E_List, file = "NetExport.txt", row.names = FALSE)

That produces a CSV file that Neo4j can load using the following command.

// Import the citation edges from NetExport.txt (fields separated by ';').
// For every line, line[0] is looked up as the name of an existing :Paper node
// and line[1] as the name of an existing :Reference node; a :Cites
// relationship from the paper to the reference is then created unless it is
// already there (MERGE), and the relationship is returned.
// Because the nodes are MATCHed rather than created, lines whose paper or
// reference is not in the database yet add nothing -- presumably the nodes are
// created by an earlier step that is not shown here (TODO confirm).
// The header line written by write.csv2 is read as an ordinary data line,
// since no WITH HEADERS is given.
LOAD CSV FROM "file:/NetExport.txt"  AS line FIELDTERMINATOR ';' MATCH (a:Paper),(b:Reference) WHERE a.name = line[0] AND b.name = line[1] MERGE (a)-[r:Cites]->(b) RETURN r

That loads the data into a Neo4j database; this could probably be done in one step, and I will probably update this later. Next, we can do things with the network in R using igraph.

library(igraph);
library(RNeo4j); #This needs to be installed with devtools to get a new enough version to interface with Neo4J

# Open the connection to the local Neo4j server.
# The password can be supplied through the NEO4J_PASSWORD environment variable
# so that it does not have to live in the script.
# NOTE(review): the fallback value is a credential committed in plain text and
# is only kept so that existing setups keep working -- rotate it and remove the
# default.
graph <- startGraph(
  "http://localhost:7474/db/data/",
  username = "neo4j",
  password = Sys.getenv("NEO4J_PASSWORD", unset = "Nufoa23")
)

# Queries returning the internal id, the name and the labels of every node of
# each kind.
papersQuery <- "MATCH (p:Paper) RETURN id(p) AS id, p.name AS pName, labels(p)"
refsQuery <- "MATCH (r:Reference) RETURN id(r) AS id, r.name AS Name, labels(r)"

# Fetch both node tables and give them identical column names so that they can
# be stacked into one vertex table (ID, Name, Type).
papers <- cypher(graph, papersQuery)
colnames(papers) <- c("ID", "Name", "Type")
references <- cypher(graph, refsQuery)
colnames(references) <- c("ID", "Name", "Type")
nodes <- rbind(papers, references)

# Whole citation graph: one directed edge for every Paper -[Cites]-> Reference.
wholeGraphQ <- "MATCH (p:Paper)-[r:Cites]->(s:Reference) RETURN id(p) AS pID, id(s) AS sID"
relations <- cypher(graph, wholeGraphQ)
# The edges refer to node ids; `nodes` supplies the matching vertex table.
wG <- graph.data.frame(d = relations, directed = TRUE, vertices = nodes)
# V(wG)$label.cex <- 0.5

# Colour and shape tell papers and references apart.
node_type <- V(wG)$Type
V(wG)$color <- ifelse(node_type == "Paper", "lightblue", "orange")
V(wG)$shape <- ifelse(node_type == "Reference", "square", "circle")

# Not read by any of the code shown here.
area <- vcount(wG)^2

# Fruchterman-Reingold layout, with the grid variant switched off.
co <- layout_with_fr(wG, grid = "nogrid")

# Save the whole graph as a 10 x 10 inch PDF.
pdf(file = "~/Documents/wGraph.pdf", width = 10, height = 10)
plot(
  wG,
  layout = co,
  vertex.size = 1,
  edge.arrow.size = 0.3,
  vertex.label = ""
)
dev.off()

# Papers linked by the works they cite (weighted network): two papers are
# connected when they cite the same reference, and Weight counts the
# references they have in common. `id(n) < id(m)` keeps each pair only once.
papersByRefs <- "MATCH path=(n:Paper)-->(d:Reference)<--(m:Paper) WHERE NOT id(n) = id(m)  AND id(n) < id(m) RETURN n.name AS Paper1, m.name AS Paper2, count(d) AS Weight"

pByRefRels <- cypher(graph, papersByRefs)
# The third column of the result becomes the edge attribute "Weight".
prG <- graph.data.frame(pByRefRels, directed = FALSE)
# Not read by any of the code shown here.
area <- vcount(prG)^2

co <- layout_with_fr(prG, grid = "nogrid")

# Cluster the papers. The edge attribute is called "Weight" (capital W, from
# the query alias); E(prG)$weight does not exist and evaluates to NULL, which
# made the clustering silently ignore the weights.
papByRClust <- cluster_fast_greedy(prG, merges = TRUE, modularity = TRUE,
  membership = TRUE, weights = E(prG)$Weight)
V(prG)$color <- papByRClust$membership + 1

# Paper name next to the id of the cluster it was assigned to.
PapCl_out <- cbind(V(prG)$name, papByRClust$membership)

write_graph(prG, file = "~/Documents/PapersNet.graphml", format = "graphml")
# Write the table built above (the name used here before, cl_out, was never
# defined).
write.csv2(PapCl_out, file = "~/Documents/PapersClusters.txt")

# Save paper-paper graph as pdf.
pdf("~/Documents/ppGraph.pdf", 10, 10)
# plot() needs the communities object followed by the graph; `clusters` is an
# igraph function, not the clustering result.
plot(papByRClust, prG, layout = co, vertex.size = 2, edge.arrow.size = 0.3,
  vertex.label = "")
dev.off()

# References linked by the papers that cite them (weighted network): two
# references are connected when the same paper cites both, and Weight counts
# the papers citing both. `id(r1) < id(r2)` keeps each pair only once.
refsByPapers <- "MATCH path=(r1:Reference)<--(p:Paper)-->(r2:Reference) WHERE NOT id(r1) = id(r2)  AND id(r1) < id(r2) RETURN r1.name AS Ref1, r2.name AS Ref2, count(p) AS Weight"

rByPapRels <- cypher(graph, refsByPapers)
# The third column of the result becomes the edge attribute "Weight".
refG <- graph.data.frame(rByPapRels, directed = FALSE)
# Not read by any of the code shown here.
area <- vcount(refG)^2

co <- layout_with_fr(refG, grid = "nogrid")

# Cluster the references. The edge attribute is called "Weight" (capital W,
# from the query alias); E(refG)$weight does not exist and evaluates to NULL,
# which made the clustering silently ignore the weights.
refByPClust <- cluster_fast_greedy(refG, merges = TRUE, modularity = TRUE,
  membership = TRUE, weights = E(refG)$Weight)
V(refG)$color <- refByPClust$membership + 1

# Reference name next to the id of the cluster it was assigned to.
RefCl_out <- cbind(V(refG)$name, refByPClust$membership)

write_graph(refG, file = "~/Documents/RefsNet.graphml", format = "graphml")
write.csv2(RefCl_out, file = "~/Documents/RefClusters.txt")

# Save Ref-Ref graph as pdf.
pdf("~/Documents/rrGraph.pdf", 10, 10)
plot(refG, layout = co, vertex.size = 2, edge.arrow.size = 0.3,
  vertex.label = "")
dev.off()