Source code for source_code.classes

import numpy as np
import pandas as pd
from source_code import functions

class research_topic():
    """A user-defined research topic and the Scopus papers related to it.

    The population of related papers is grown from one reference paper by
    repeatedly following citation links in both directions and keeping only
    the papers accepted by ``functions.check_related_articles`` for the
    topic keywords.
    """

    def __init__(self, name: str, reference_paper_eid: str, keywords: list) -> None:
        """Create an empty research topic.

        Args:
            name: Name of the research topic (any user-defined string).
            reference_paper_eid: Scopus EID of the paper the search starts
                from, e.g. ``'2-s2.0-85085924004'``.
            keywords: List of keywords that define the topic.
        """
        self.name = name
        self.reference_paper_eid = reference_paper_eid
        self.keywords = keywords
        # The containers below are filled by analyze().
        self.paper_population = []             # EIDs of papers accepted for the topic
        self.publications_outside_scopus = []  # handed to functions.get_EIDS
        self.publications_with_errors = []     # handed to functions.check_related_articles
        self.number_analyzed_papers = 0        # counter of processed papers, not a list

    def _add_related_papers(self, papers) -> None:
        """Filter ``papers`` by the topic keywords and merge them into the population.

        NOTE(review): ``functions.get_paper_population`` is assumed to extend
        ``self.paper_population`` in place, because its return value was never
        used by the original code -- confirm against ``functions``.
        """
        # The original code compares the search result against 0 before using
        # it; presumably 0 is the "nothing found" sentinel -- TODO confirm.
        if papers != 0:
            eids = functions.get_EIDS(papers, self.publications_outside_scopus)
            eids = functions.check_related_articles(
                eids, self.keywords, self.publications_with_errors)
            functions.get_paper_population(eids, self.paper_population)

    def _expand_from(self, paper_eid) -> int:
        """Add the relevant neighbours of ``paper_eid`` to the population.

        A failed Scopus request is reported and skipped so that one bad paper
        does not abort the whole crawl.

        Returns:
            The number of requests (0, 1 or 2) that failed.
        """
        requests = (
            ('Error with citing papers', functions.get_citing_papers),
            ('Error with cited papers', functions.get_cited_papers),
        )
        failures = 0
        for message, fetch in requests:
            try:
                papers = fetch(paper_eid)
            except Exception as err:  # not a bare except: Ctrl+C must still stop the run
                print(message + ':', err)
                failures += 1
            else:
                self._add_related_papers(papers)
        return failures

    def analyze(self) -> "research_topic":
        """Run the principal analysis and fill this research topic.

        Stages:
            1. Build the first population from the papers citing, and cited
               by, the reference paper.
            2. Process every paper of the population the same way until no
               unprocessed paper is left.
            3. Build the connection graph and save the result table to
               ``<name>_outputs.xlsx``.

        The graph table (``graph_df``) is produced by
        ``functions.creating_connection_graph``. According to the original
        documentation its ``Direction`` column is 1 when the paper from the
        primary list cites the paper from the secondary list and 2 for the
        opposite direction.

        Returns:
            ``self``, with ``paper_population`` and ``number_analyzed_papers``
            filled.
        """
        # ----------------------- First stage ------------------------
        # Errors are deliberately not caught here: without the reference
        # paper there is nothing to analyze.
        print('<<< First stage: processing the reference eid >>>')
        citing_papers = functions.get_citing_papers(self.reference_paper_eid)
        cited_papers = functions.get_cited_papers(self.reference_paper_eid)
        self._add_related_papers(citing_papers)
        self._add_related_papers(cited_papers)

        # ----------------------- Second stage -----------------------
        print('<<< Second stage: processing each paper in population >>>')
        analyzed = set()
        errors_count = 0
        # Work on a copy: aliasing the population here would make any removal
        # from the work list silently delete papers from the population.
        pending = list(self.paper_population)
        while pending:
            current_eid = pending[0]
            errors_count += self._expand_from(current_eid)
            analyzed.add(current_eid)

            population_size = len(self.paper_population)
            print('Population:', population_size)
            if population_size:
                share = round(len(analyzed) / population_size * 100, 2)
                print('Papers analyzed: ', len(analyzed), 'or ', share, ' %')
            else:
                print('Papers analyzed: ', len(analyzed))

            # Re-derive the work list so newly added papers are picked up;
            # keeping population order makes the crawl order reproducible.
            pending = [eid for eid in self.paper_population if eid not in analyzed]
            print('Non_analyzed:', len(pending))
            print(' ')  # empty line in a command window

        self.number_analyzed_papers = len(analyzed)
        if errors_count:
            print('Failed requests:', errors_count)

        # ----------------------- Third stage ------------------------
        print('<<< Third stage: postprocessing of results >>>')
        graph_df = functions.creating_connection_graph(
            self.name, self.paper_population, self.publications_outside_scopus)
        connections = functions.calculate_connections_number(
            graph_df, self.paper_population)
        connections.to_excel(self.name + '_' + 'outputs.xlsx')

        print('<<<< Analysis is finished >>>>')
        return self

    def plot_network_graph(self) -> None:
        """Show an interactive graph of how the publications are interrelated.

        Reads ``<name>_graph_df.xlsx`` from the current directory and displays
        it with Bokeh. Prints a hint and returns when the population is empty
        or the file is missing.
        """
        if not self.paper_population:
            print('The paper population is empty. Use analyze() first to get a paper population')
            return

        from pathlib import Path

        filename = self.name + '_' + 'graph_df.xlsx'
        if not Path(filename).is_file():
            print(f'The file {filename} does not exist')
            return

        # Plotting packages are imported only once there is something to plot.
        import networkx as nx
        from bokeh.io import show
        from bokeh.models import Range1d, Circle, MultiLine
        from bokeh.plotting import figure
        from bokeh.plotting import from_networkx

        graph_df = pd.read_excel(filename)

        # Graph whose edges link the papers of 'secondary_list' to the papers
        # of 'primary_list'.
        G = nx.from_pandas_edgelist(graph_df, target='primary_list', source='secondary_list')

        title = 'Interconnection of papers in their population'

        # Fields shown when hovering over a node.
        HOVER_TOOLTIPS = [("Scopus eid", "@index")]

        # Create a plot: set dimensions, toolbar, and title.
        plot = figure(width=1400, height=700, tooltips=HOVER_TOOLTIPS,
                      tools="pan,wheel_zoom,save,reset", active_scroll='wheel_zoom',
                      x_range=Range1d(-10.1, 10.1), y_range=Range1d(-10.1, 10.1),
                      title=title)

        network_graph = from_networkx(G, nx.spring_layout, scale=10, center=(0, 0))

        # Node size and color.
        network_graph.node_renderer.glyph = Circle(size=7, fill_color='skyblue')

        # Edge opacity and width.
        network_graph.edge_renderer.glyph = MultiLine(line_alpha=0.5, line_width=1)

        plot.renderers.append(network_graph)
        show(plot)