from pybliometrics.scopus import ScopusSearch, AbstractRetrieval, AuthorRetrieval
# import time
import pandas as pd
import numpy as np
import re
#--------------------------------------------------------------------
# Definition of functions #
#--------------------------------------------------------------------
# Get_citing_papers for reference paper
def get_citing_papers(reference_paper_eid):
"""
This function returns a list of citing papers for a given paper.
INPUT:
    - reference_paper_eid: the eid of a paper in Scopus, e.g. 2-s2.0-85101235827
OUTPUT:
- citing_papers: citing papers of given eid
"""
# Create a query
query = f"REF({reference_paper_eid})"
    try:  # try to query through ScopusSearch from pybliometrics
        # Search in Scopus using the query
        s = ScopusSearch(query)
        # Extract data on the papers citing the reference paper
        citing_papers = s.results
    except Exception:  # if any problem occurs with ScopusSearch
        # Fall back to 0 so callers can detect the failure
        citing_papers = 0
return citing_papers
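# Illustrative usage sketch (assumes pybliometrics is configured with a valid
# Scopus API key; the EID below is the placeholder from the docstring):
#
# citing = get_citing_papers("2-s2.0-85101235827")
# if citing != 0:
#     print(len(citing), "citing papers retrieved")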
# Get_cited_papers for given reference paper
def get_cited_papers(reference_paper_eid):
"""
This function returns a list of references for a given paper.
INPUT:
    - reference_paper_eid: the eid of a paper in Scopus, e.g. 2-s2.0-85101235827
OUTPUT:
- cited_papers: references of given paper's eid
"""
    # Retrieve the full abstract record of the reference paper
ss = AbstractRetrieval(reference_paper_eid, view='FULL')
# Extract the reference list of given paper (reference_paper_eid)
cited_papers = ss.references
return cited_papers
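# Illustrative usage sketch (note that, unlike get_citing_papers, this call is
# not wrapped in try/except, so an invalid EID raises an exception; .references
# may be None when the record has no parsed reference list):
#
# references = get_cited_papers("2-s2.0-85101235827")
# if references is not None:
#     print(len(references), "references retrieved")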
# Get eids for citing or cited paper
def get_EIDS(paper_object, publications_outside_scopus):
"""
This function returns a list of eids for a given paper according
to Scopus database. If paper is not in Scopus, then it is saved
into publications_outside_scopus.
INPUT:
    - paper_object: information extracted from a ScopusSearch query
    - publications_outside_scopus: a list for collecting the publications that are not available in Scopus
OUTPUT:
- eids_list: list of eids in Scopus database
"""
    # Create an empty list (for filling in)
    eids_list = []
    # Skip empty inputs: None, 0 (failed search) and empty lists are all falsy
    if paper_object:
        # Extract eids from the paper object
        for paper in paper_object:
            if hasattr(paper, 'eid'):  # ScopusSearch results carry an eid
                # Extract the eid of the given paper
                eids_list.append(paper.eid)
            elif paper.id is not None:  # references carry an id (an eid without the "2-s2.0-" prefix)
                # Append the paper's id, prefixed to match the eid format
                eids_list.append("2-s2.0-" + str(paper.id))
            else:
                # Paper is not indexed in Scopus: keep its full citation text
                publications_outside_scopus.append(str(paper.fulltext))
return eids_list
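# Illustrative usage sketch, chaining the retrieval functions above (the EID
# is the placeholder from the docstrings):
#
# outside_scopus = []
# citing = get_citing_papers("2-s2.0-85101235827")
# citing_eids = get_EIDS(citing, outside_scopus)
# print(len(citing_eids), "eids;", len(outside_scopus), "papers outside Scopus")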
def get_paper_population(eids_list, paper_population):
"""
    This function checks whether papers from eids_list already exist in a
    population of papers (from Scopus). Papers that are not yet in the
    population are added; the (possibly updated) population is returned.
INPUT:
- eids_list: a list of papers - candidates for adding into a population
- paper_population: the current population of papers on a given topic
OUTPUT:
- paper_population: updated (or current) population of papers on
a given topic
"""
    if eids_list != 0:  # eids_list is 0 when the upstream search failed
        # Append new eids to the paper population
        for eid in eids_list:
            if eid not in paper_population:  # skip eids already in the population
                paper_population.append(eid)
return paper_population
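# Illustrative usage sketch (the population list is extended in place and
# also returned; citing_eids is from the sketch above):
#
# population = []
# population = get_paper_population(citing_eids, population)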
def check_related_articles(eid_list, keywords, publications_with_errors):
"""
    This function keeps only the papers corresponding to the given topic.
    A paper is kept if any of the predefined keywords appears in its
    title, abstract or author keywords.
INPUT:
    - eid_list: a list of papers to screen against the keywords
- keywords: user defined keywords
- publications_with_errors: list where a publication is saved if
any error occurs during the processing
OUTPUT:
    - eid_list: the list of kept papers corresponding to the given topic (0 if none match)
"""
    # Prepare a zero decision vector (one entry per paper)
    decision_vector = np.zeros(len(eid_list))  # 0 - drop the paper; 1 - keep the paper
# Checking the papers eids from the given eid_list
for idx,paper_eid in enumerate(eid_list): # for each paper eid
# Checking the abstract by keywords
        try:
            # Retrieve the full abstract record of the given paper
            ss = AbstractRetrieval(paper_eid, view='FULL')
            # Check for keywords in the abstract
            if hasattr(ss, 'abstract'):  # if an abstract exists
                paper_abstract = ss.abstract  # extract the paper abstract
                # Check if any keyword occurs in the abstract (guard against a missing abstract)
                if paper_abstract is not None and any(keyword in paper_abstract for keyword in keywords):
                    decision_vector[idx] = 1  # mark the paper as relevant
            if decision_vector[idx] != 1:  # if still zero
                # Check for keywords among the author keywords
                if hasattr(ss, 'authkeywords'):  # if author keywords exist
                    paper_keywords = ss.authkeywords  # extract the author keywords
                    # Check if any keyword occurs among the author keywords
                    if paper_keywords is not None and any(keyword in paper_keywords for keyword in keywords):
                        decision_vector[idx] = 1  # mark the paper as relevant
            if decision_vector[idx] != 1:  # if still zero
                # Check for keywords in the title
                if hasattr(ss, 'title'):  # if a title exists
                    paper_title = ss.title  # extract the paper's title
                    # Check if any keyword occurs in the title (guard against a missing title)
                    if paper_title is not None and any(keyword in paper_title for keyword in keywords):
                        decision_vector[idx] = 1  # mark the paper as relevant
        except Exception as exception:  # if any error occurred with AbstractRetrieval
            # Print the error name and the eid of the paper
            print(type(exception).__name__, 'with', paper_eid)
            # Save the error and the paper eid in a dedicated list (for post analysis)
            publications_with_errors.append([type(exception).__name__, 'with', paper_eid])
    # Deciding on each paper: drop or keep (in a population)
    if np.sum(decision_vector) > 0:  # if at least one paper was marked relevant
        # Keep only the eids whose decision value is 1 (relevant publications)
        eid_list = [eid for eid, decision in zip(eid_list, decision_vector) if decision == 1]
    else:  # if all values in decision_vector are zero
        # Signal "no relevant papers" with 0
        eid_list = 0
return eid_list
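# Illustrative usage sketch (keyword matching is a case-sensitive substring
# test, so variants such as those produced by generate_variations() below can
# be passed):
#
# errors = []
# kept = check_related_articles(population, ["digital twin", "Digital twin"], errors)
# if kept != 0:
#     print(len(kept), "relevant papers kept;", len(errors), "retrieval errors")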
# Function to retrieve the metadata for each paper in a Scopus population
def retrieve_paper_data(paper_population):
"""
This function retrieves the metadata for each paper from a given
population of papers (in Scopus).
INPUT:
- paper_population: a list of eids representing the population of papers
OUTPUT:
- df: a dataframe with several columns (see below)
"""
# Column names
    column_names = ['eid', 'title', 'publicationName', 'coverDate', 'refcount', 'citedby_count', 'doi']
    # Create an empty dataframe
    df = pd.DataFrame(columns=column_names)
    if paper_population is not None:  # if there is population data
        for paper_eid in paper_population:  # for each paper eid
            # Retrieve the full abstract record of the given paper
            paper_data = AbstractRetrieval(paper_eid, view='FULL')
            # Preselect the data for df
            retrieved_data = [paper_eid, paper_data.title, paper_data.publicationName, paper_data.coverDate,
                              paper_data.refcount, paper_data.citedby_count, paper_data.doi]
            # Create an intermediate dataframe with the retrieved data for the given paper
            df2 = pd.DataFrame([retrieved_data], columns=column_names)
            # Concatenate dataframes df and df2 (ignore_index keeps a clean running index)
            df = pd.concat([df, df2], ignore_index=True)
    return df  # return the dataframe with the retrieved data for all papers in the population
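# Illustrative usage sketch (one AbstractRetrieval request per paper, so a
# large population may be slow or exhaust the API quota):
#
# metadata_df = retrieve_paper_data(population)
# print(metadata_df.head())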
def creating_connection_graph(name, paper_population, publications_outside_scopus):
"""
This function creates a network graph of publications and saves it
it in xlsx format.
INPUT:
- paper_population - a population of papers (a list of eids)
- publications_outside_scopus - list of publications outside of scopus (required for get_EIDS)
OUTPUT:
- graph_df.xlsx - excel table, representing the created graph
"""
    # Create column names (for the Excel table)
    columns_name = ['primary_list', 'secondary_list', 'Direction']
    direction_forward = 1   # 'primary_secondary': the paper from the primary list cites the paper from the secondary list
    direction_backward = 2  # 'secondary_primary': the paper from the SECONDARY list cites the paper from the primary list
    # Create an empty dataframe
    graph_df = pd.DataFrame(columns=columns_name)
    for paper_eid in paper_population:  # for each paper eid
# Get citing and cited papers for given reference paper
citing_papers=get_citing_papers(paper_eid)
cited_papers=get_cited_papers(paper_eid)
# Get EID of these citing and cited papers
        eid_list_citing = get_EIDS(citing_papers, publications_outside_scopus)
        eid_list_cited = get_EIDS(cited_papers, publications_outside_scopus)
        # Keep only connections to papers that are themselves in the population
        eid_list_citing = list(set(eid_list_citing).intersection(paper_population))
        eid_list_cited = list(set(eid_list_cited).intersection(paper_population))
        if len(eid_list_citing) != 0:
            # Prepare graph_df rows for citing articles
            intermediate_df = pd.DataFrame(columns=columns_name)
            intermediate_df['primary_list'] = [paper_eid] * len(eid_list_citing)
            intermediate_df['secondary_list'] = eid_list_citing
            intermediate_df['Direction'] = [str(direction_backward)] * len(eid_list_citing)
            graph_df = pd.concat([graph_df, intermediate_df])
        if len(eid_list_cited) != 0:
            # Prepare graph_df rows for cited articles (references)
            intermediate_df = pd.DataFrame(columns=columns_name)
            intermediate_df['primary_list'] = [paper_eid] * len(eid_list_cited)
            intermediate_df['secondary_list'] = eid_list_cited
            intermediate_df['Direction'] = [str(direction_forward)] * len(eid_list_cited)
            graph_df = pd.concat([graph_df, intermediate_df])
    # Create the output filename
    filename = name + '_graph_df.xlsx'
    # Save to Excel
    graph_df.to_excel(filename, index=False)
return graph_df
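# Illustrative usage sketch ('name' is only used as the filename prefix; the
# call below writes my_topic_graph_df.xlsx next to the script):
#
# graph_df = creating_connection_graph("my_topic", population, outside_scopus)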
def calculate_connections_number(graph_df, paper_population):
"""
    This function calculates the number of connections for each paper
    in a population of papers.
    INPUT:
    - graph_df: the dataframe generated by creating_connection_graph()
    - paper_population: a population of papers (a list of eids)
    OUTPUT:
    - connections: a dataframe (paper metadata + number of connections inside the population)
"""
    # Count the occurrences of each (primary, secondary) pair (the column for merging later)
    df = graph_df.groupby(['primary_list', 'secondary_list']).size().reset_index().rename(columns={0: 'count'})
    # Sum the counts per primary paper to get its number of connections
    connections = df.groupby('primary_list', as_index=False)['count'].sum()
# Extract metadata for population
population_data=retrieve_paper_data(paper_population)
# Merge into a new dataframe
    connections = pd.merge(population_data, connections, left_on='eid', right_on='primary_list')
return connections
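# Illustrative usage sketch, combining the two previous steps:
#
# connections = calculate_connections_number(graph_df, population)
# print(connections.sort_values("count", ascending=False).head())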
def generate_variations(keywords):
    """
    This function generates spelling variations of the given keywords:
    capitalized and singular/plural forms, plus "X of Y" combinations
    (optionally with "a" or "the" after "of").
    INPUT:
    - keywords: a list of keywords
    OUTPUT:
    - variations: a list of keyword variation phrases
    """
    all_variations = []  # Start with an empty list of variations
for keyword in keywords:
# Generate all possible variations for the keyword, including capitalization and pluralization
variations = [keyword, keyword.capitalize()]
if keyword.endswith("s"):
variations.append(keyword[:-1]) # Add singular version if keyword ends with "s"
else:
variations.append(keyword + "s") # Add plural version otherwise
# Add the variations for this keyword to the list of all variations
all_variations.append(variations)
# Generate all possible combinations of the variations for each keyword, without combining words from different keywords
combined_variations = []
for variations in all_variations:
for variation in variations:
combined_variations.append([variation])
# Add variations with "of" between the keywords, including articles after "of"
for i in range(len(keywords)):
for j in range(len(keywords)):
if i != j:
for var1 in all_variations[i]:
for var2 in all_variations[j]:
# Add variations without articles
combined_variations.append([var1, "of", var2])
# Add variations with "a" or "the" after "of"
if j > i: # Only add articles if the second keyword comes after the first
combined_variations.append([var1, "of", "a", var2])
combined_variations.append([var1, "of", "the", var2])
# Join each variation into a single phrase and return the list of variations
variations = [' '.join(variation) for variation in combined_variations]
# Swap the order of the keywords and join with "of", including articles after "of"
for variation in combined_variations:
if len(variation) == 2:
variations.append(' '.join(reversed(variation)) + ' of')
elif len(variation) == 3 and variation[1] == "of":
variations.append(variation[2] + " of " + variation[0])
variations.append("the " + variation[2] + " of " + variation[0])
variations.append("a " + variation[2] + " of " + variation[0])
return variations
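# Illustrative usage sketch:
#
# generate_variations(["digital", "twin"]) returns, among others:
# 'digital', 'Digital', 'digitals', 'digital of twin', 'digital of a twin',
# 'digital of the twin', 'twin of digital', ...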