Source code for source_code.functions

from pybliometrics.scopus import ScopusSearch, AbstractRetrieval, AuthorRetrieval
# import time
import pandas as pd 
import numpy as np 
import re

#--------------------------------------------------------------------
                    # Definition of functions # 
#--------------------------------------------------------------------

# Get citing papers for a given reference paper
def get_citing_papers(reference_paper_eid):
    """
    This function returns a list of citing papers for a given paper.

    INPUT:
    - reference_paper_eid: the EID of a paper in Scopus, e.g. 2-s2.0-85101235827

    OUTPUT:
    - citing_papers: the papers citing the given EID
    """
    # Create a query
    query = f"REF({reference_paper_eid})"
    try:  # try to query through ScopusSearch from pybliometrics
        # Search Scopus using the query
        s = ScopusSearch(query)
        # Extract data on the papers citing the reference paper
        citing_papers = s.results
    except Exception:  # if any problem occurs with ScopusSearch
        # Set citing_papers to 0
        citing_papers = 0
    return citing_papers
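# Example usage (a minimal sketch, assuming pybliometrics is configured with a
# valid Scopus API key; the EID below is the one from the docstring):
#
#     citing = get_citing_papers("2-s2.0-85101235827")
#     if citing not in (None, 0):
#         print(f"{len(citing)} citing papers found")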
# Get cited papers (references) for a given reference paper
def get_cited_papers(reference_paper_eid):
    """
    This function returns a list of references for a given paper.

    INPUT:
    - reference_paper_eid: the EID of a paper in Scopus, e.g. 2-s2.0-85101235827

    OUTPUT:
    - cited_papers: the references of the given paper's EID
    """
    # Retrieve the full abstract data of the reference paper
    ss = AbstractRetrieval(reference_paper_eid, view='FULL')
    # Extract the reference list of the given paper (reference_paper_eid)
    cited_papers = ss.references
    return cited_papers
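# Example usage (a minimal sketch; unlike get_citing_papers, this function has
# no try/except, so AbstractRetrieval errors for unknown EIDs propagate to the
# caller and may need handling in batch runs):
#
#     refs = get_cited_papers("2-s2.0-85101235827")
#     if refs is not None:
#         print(f"{len(refs)} references found")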
# Get EIDs for citing or cited papers
def get_EIDS(paper_object, publications_outside_scopus):
    """
    This function returns a list of EIDs for a given paper according to the
    Scopus database. If a paper is not in Scopus, it is saved into
    publications_outside_scopus.

    INPUT:
    - paper_object: information extracted from a query using ScopusSearch
    - publications_outside_scopus: a list for saving the publications which
      are not available in Scopus

    OUTPUT:
    - eids_list: list of EIDs in the Scopus database
    """
    # Create an empty list (for filling in)
    eids_list = []
    # Check the edge cases of paper_object: None, 0, or empty
    if paper_object is None or paper_object == 0 or len(paper_object) == 0:
        pass  # do nothing
    else:
        # Extract EIDs from the paper object
        for paper in range(len(paper_object)):
            if hasattr(paper_object[paper], 'eid'):  # if an EID exists
                # Extract the EID of the given paper
                eids_list.append(paper_object[paper].eid)
            # If the EID does not exist, extract the ID
            # (same as the EID but without the "2-s2.0-" prefix)
            elif paper_object[paper].id is not None:
                # Append the paper's ID to eids_list
                eids_list.append("2-s2.0-" + str(paper_object[paper].id))
            else:
                # Append the paper's full-text reference to publications_outside_scopus
                publications_outside_scopus.append(str(paper_object[paper].fulltext))
    return eids_list
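# Example usage (a minimal sketch chaining the functions above; the EID is the
# example one from the docstrings):
#
#     outside_scopus = []
#     cited = get_cited_papers("2-s2.0-85101235827")
#     cited_eids = get_EIDS(cited, outside_scopus)
#     print(len(cited_eids), "references resolved to EIDs")
#     print(len(outside_scopus), "references outside Scopus")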
def get_paper_population(eids_list, paper_population):
    """
    This function checks whether the papers from eids_list already exist in a
    population of papers (from Scopus). If yes, the code returns the existing
    population. If not, the code adds the papers to the existing population
    and returns it.

    INPUT:
    - eids_list: a list of papers, candidates for adding to the population
    - paper_population: the current population of papers on a given topic

    OUTPUT:
    - paper_population: the updated (or current) population of papers on a given topic
    """
    if eids_list != 0:  # eids_list is not zero
        # Append eids_list to the paper population
        for eid in range(len(eids_list)):  # for each EID from eids_list
            if eids_list[eid] in paper_population:  # if the EID is already in the population
                pass  # do nothing
            else:  # eids_list[eid] is not in the population
                # Add eids_list[eid] to the population
                paper_population.append(eids_list[eid])
    else:  # eids_list == 0
        pass  # do nothing
    return paper_population
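# Example usage (a minimal sketch; the second EID is a hypothetical placeholder,
# and duplicates are skipped so the population stays a list of unique EIDs):
#
#     population = ["2-s2.0-85101235827"]
#     candidates = ["2-s2.0-85101235827", "2-s2.0-00000000000"]
#     population = get_paper_population(candidates, population)
#     print(population)   # the duplicate EID was added only once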
# Function to retrieve the metadata for each paper from the Scopus population
def retrieve_paper_data(paper_population):
    """
    This function retrieves the metadata for each paper from a given
    population of papers (in Scopus).

    INPUT:
    - paper_population: a list of EIDs representing the population of papers

    OUTPUT:
    - df: a dataframe with several columns (see below)
    """
    # Column names
    column_names = ['eid', 'title', 'publicationName', 'coverDate',
                    'refcount', 'citedby_count', 'doi']
    # Create an empty dataframe
    df = pd.DataFrame(columns=column_names)
    if paper_population is None:  # if paper_population has no data
        pass  # do nothing and return df without data
    else:  # if the population is NOT None
        for idx, paper_eid in enumerate(paper_population):  # for each paper EID
            # Retrieve the entire abstract metadata for the given paper
            paper_data = AbstractRetrieval(paper_eid, view='FULL')
            # Preselect the data for df
            retrieved_data = [paper_eid, paper_data.title, paper_data.publicationName,
                              paper_data.coverDate, paper_data.refcount,
                              paper_data.citedby_count, paper_data.doi]
            # Create an intermediate dataframe with the retrieved data for the given paper
            df2 = pd.DataFrame([retrieved_data], columns=column_names)
            # Concatenate the dataframes df and df2
            df = pd.concat([df, df2])
    return df  # dataframe with the retrieved data for all papers in the population
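# Example usage (a minimal sketch, assuming Scopus access; one AbstractRetrieval
# request is made per EID, so large populations cost time and API quota):
#
#     df = retrieve_paper_data(["2-s2.0-85101235827"])
#     print(df[['eid', 'coverDate', 'citedby_count']])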
def creating_connection_graph(name, paper_population, publications_outside_scopus):
    """
    This function creates a network graph of publications and saves it in
    xlsx format.

    INPUT:
    - name: a prefix for the output filename
    - paper_population: a population of papers (a list of EIDs)
    - publications_outside_scopus: a list of publications outside of Scopus
      (required for get_EIDS)

    OUTPUT:
    - <name>_graph_df.xlsx: an Excel table representing the created graph
    """
    # Create the column names (for the Excel table)
    columns_name = ['primary_list', 'secondary_list', 'Direction']
    # 'primary_secondary': the paper from the PRIMARY list cites the paper from the secondary list
    direction_forward = 1
    # 'secondary_primary': the paper from the SECONDARY list cites the paper from the primary list
    direction_backward = 2
    # Create an empty dataframe
    graph_df = pd.DataFrame(columns=columns_name)
    for idx, paper_eid in enumerate(paper_population):  # for each paper EID
        # Get the citing and cited papers for the given reference paper
        citing_papers = get_citing_papers(paper_eid)
        cited_papers = get_cited_papers(paper_eid)
        # Get the EIDs of these citing and cited papers
        eid_list_citing = get_EIDS(citing_papers, publications_outside_scopus)
        eid_list_cited = get_EIDS(cited_papers, publications_outside_scopus)
        # Convert to sets, intersect with the population, and convert back to lists
        eid_list_citing = list(set(eid_list_citing).intersection(paper_population))
        eid_list_cited = list(set(eid_list_cited).intersection(paper_population))
        if len(eid_list_citing) != 0:
            # Prepare graph_df rows for the citing articles
            intermediate_df = pd.DataFrame(columns=columns_name)
            intermediate_df['primary_list'] = [paper_eid] * len(eid_list_citing)
            intermediate_df['secondary_list'] = eid_list_citing
            intermediate_df['Direction'] = [direction_backward] * len(eid_list_citing)
            graph_df = pd.concat([graph_df, intermediate_df])
        if len(eid_list_cited) != 0:
            # Prepare graph_df rows for the cited articles (references)
            intermediate_df = pd.DataFrame(columns=columns_name)
            intermediate_df['primary_list'] = [paper_eid] * len(eid_list_cited)
            intermediate_df['secondary_list'] = eid_list_cited
            intermediate_df['Direction'] = [direction_forward] * len(eid_list_cited)
            graph_df = pd.concat([graph_df, intermediate_df])
    # Create a filename
    filename = name + '_' + 'graph_df.xlsx'
    # Save to Excel
    graph_df.to_excel(filename, index=False)
    return graph_df
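# Example usage (a minimal end-to-end sketch, assuming Scopus access; the
# second EID is a hypothetical placeholder for a paper connected to the first):
#
#     outside_scopus = []
#     population = ["2-s2.0-85101235827", "2-s2.0-85100000000"]
#     graph = creating_connection_graph("demo", population, outside_scopus)
#     # graph holds one row per directed edge and is also saved to demo_graph_df.xlsx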
def calculate_connections_number(graph_df, paper_population):
    """
    This function calculates the number of connections for each paper in a
    population of papers.

    INPUT:
    - graph_df: the dataframe generated by creating_connection_graph()
    - paper_population: a population of papers (a list of EIDs)

    OUTPUT:
    - connections: a dataframe (the papers' metadata + the number of
      connections inside the population)
    """
    # Calculate the count per edge (the column for merging later)
    df = graph_df.groupby(['primary_list', 'secondary_list']).size().reset_index().rename(columns={0: 'count'})
    # Find the number of connections per paper
    connections = df.groupby('primary_list')['count'].sum().reset_index()
    # Extract the metadata for the population
    population_data = retrieve_paper_data(paper_population)
    # Merge into a new dataframe
    connections = pd.merge(population_data, connections, left_on='eid', right_on='primary_list')
    return connections
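# Example usage (a minimal sketch continuing the example above):
#
#     connections = calculate_connections_number(graph, population)
#     print(connections[['eid', 'title', 'count']])   # connections per paper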
def generate_variations(keywords):
    """
    This function generates phrase variations of the given keywords
    (capitalization, pluralization, and "of" phrases with optional articles).

    INPUT:
    - keywords: a list of keywords

    OUTPUT:
    - variations: a list of keyword variations as phrases
    """
    all_variations = []  # start with an empty list of variations
    for keyword in keywords:
        # Generate all possible variations for the keyword,
        # including capitalization and pluralization
        variations = [keyword, keyword.capitalize()]
        if keyword.endswith("s"):
            variations.append(keyword[:-1])  # add singular version if keyword ends with "s"
        else:
            variations.append(keyword + "s")  # add plural version otherwise
        # Add the variations for this keyword to the list of all variations
        all_variations.append(variations)

    # Generate all possible combinations of the variations for each keyword,
    # without combining words from different keywords
    combined_variations = []
    for variations in all_variations:
        for variation in variations:
            combined_variations.append([variation])

    # Add variations with "of" between the keywords, including articles after "of"
    for i in range(len(keywords)):
        for j in range(len(keywords)):
            if i != j:
                for var1 in all_variations[i]:
                    for var2 in all_variations[j]:
                        # Add variations without articles
                        combined_variations.append([var1, "of", var2])
                        # Add variations with "a" or "the" after "of"
                        if j > i:  # only add articles if the second keyword comes after the first
                            combined_variations.append([var1, "of", "a", var2])
                            combined_variations.append([var1, "of", "the", var2])

    # Join each variation into a single phrase
    variations = [' '.join(variation) for variation in combined_variations]

    # Swap the order of the keywords and join with "of", including articles after "of"
    for variation in combined_variations:
        if len(variation) == 2:
            variations.append(' '.join(reversed(variation)) + ' of')
        elif len(variation) == 3 and variation[1] == "of":
            variations.append(variation[2] + " of " + variation[0])
            variations.append("the " + variation[2] + " of " + variation[0])
            variations.append("a " + variation[2] + " of " + variation[0])

    return variations
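# Example usage (runs locally, no Scopus access needed; note that the naive
# pluralization simply appends or strips "s"):
#
#     phrases = generate_variations(["energy", "storage"])
#     print(phrases[:3])                      # ['energy', 'Energy', 'energys']
#     print("storage of energy" in phrases)   # True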