/*
* FindRelated
* Copyright (c) 2003-2011 Stellman & Greene Consulting
* Developed for Joshua Zivin and Pierre Azoulay, Columbia University
* http://www.stellman-greene.com/PublicationHarvester
*
* This program is free software; you can redistribute it and/or modify it under
* the terms of the GNU General Public License as published by the Free Software
* Foundation; either version 2 of the License, or (at your option) any later
* version.
*
* This program is distributed in the hope that it will be useful, but WITHOUT
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
* FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License along with
* this program (GPL.txt); if not, write to the Free Software Foundation, Inc., 51
* Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
*/
using System;
using System.Collections.Generic;
using System.Text;
using System.Xml;
using System.Net;
using System.IO;
using System.Data;
using System.Diagnostics;
using Com.StellmanGreene.PubMed;
namespace Com.StellmanGreene.FindRelated
{
internal class RelatedFinder
{
/// <summary>
/// Rank and score recorded for one related publication found in an ELink result.
/// </summary>
struct RankAndScore
{
    /// <summary>
    /// Position of the related publication within its link set (the parser counts from 1).
    /// </summary>
    public int Rank { get; set; }

    /// <summary>
    /// Score parsed from the link's Score element (the parser stores -1 when it cannot be parsed).
    /// </summary>
    public int Score { get; set; }
}
// ELink endpoint. NCBI serves the E-utilities over HTTPS from the eutils host; the old
// plain-HTTP www address only redirects, which does not preserve a POST body.
const string ELINK_URL = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/elink.fcgi";
// Value sent as the ELink "db" request parameter
const string ELINK_DB = "pubmed";
// Value sent as the ELink "dbfrom" request parameter
const string ELINK_DBFROM = "pubmed";
// NCBI client used to run the PubMed searches for related publications
private readonly NCBI ncbi = new NCBI("medline");
/// <summary>
/// Worker used to report progress and to poll for cancellation requests
/// </summary>
public System.ComponentModel.BackgroundWorker BackgroundWorker { get; set; }
/// <summary>
/// Execute the FindRelated search, create and populate the tables
/// </summary>
/// <param name="odbcDsn">ODBC DSN to access the SQL server</param>
/// <param name="relatedTableName">Name of the FindRelated SQL table to create</param>
/// <param name="inputFileInfo">FileInfo object with information about the input CSV file</param>
/// <param name="publicationFilter">PublicationFilter object to use for filtering publications</param>
/// <param name="resume">True if resuming a previous run</param>
/// <param name="liteMode">True if in "lite" mode, where it only runs the FindRelated search and does not do additional processing</param>
/// <param name="liteModeOutputFile">Output filename for "lite" mode (ignored when not in "lite" mode)</param>
public void Go(string odbcDsn, string relatedTableName, FileInfo inputFileInfo, PublicationFilter publicationFilter, bool resume, bool liteMode, string liteModeOutputFile)
{
    Database db = new Database(odbcDsn);
    string queueTableName = relatedTableName + "_queue";
    string extremeRelevanceTableName = relatedTableName + "_extremerelevance";

    InputQueue inputQueue;
    if (resume)
    {
        // Resuming: the tables (and the "lite" mode output file) from the previous run are reused
        inputQueue = new InputQueue(db, queueTableName);
    }
    else
    {
        if (liteMode && !CreateLiteModeOutputFile(liteModeOutputFile))
            return;
        CreateTables(db, relatedTableName, queueTableName, extremeRelevanceTableName, liteMode);
        inputQueue = new InputQueue(inputFileInfo, db, queueTableName);
    }

    int setnbCount = 0;
    while (inputQueue.Next())
    {
        // Progress is reported before the counter is incremented, so the first setnb reports 0%
        int queueSize = inputQueue.Count;
        ReportProgressPercent(queueSize > 0 ? (100 * setnbCount) / queueSize : 0);
        setnbCount++;
        Trace.WriteLine(DateTime.Now + " - querying for related articles for setnb " + inputQueue.CurrentSetnb + " (" + setnbCount + " of " + queueSize + ")");

        // Do the linked publication search for the author's PMIDs and process the results.
        // This returns a Dictionary that maps author publications (from the PeoplePublications table)
        // to linked publications, so each key is one of the author publications read from the DB originally.
        string xml = ExecuteRelatedSearch(inputQueue.CurrentPmids);
        Dictionary<int, Dictionary<int, RankAndScore>> relatedRanks;
        Dictionary<int, List<int>> relatedSearchResults = GetIdsFromXml(xml, out relatedRanks);

        bool completed;
        if (liteMode)
        {
            Trace.WriteLine(DateTime.Now + " - found " + relatedSearchResults.Count + " PMIDs for setnb " + inputQueue.CurrentSetnb);
            completed = WriteRelatedRanksToOutputFileAndDatabaseForLiteMode(db, relatedTableName, relatedSearchResults, relatedRanks, liteModeOutputFile, inputQueue);
        }
        else
        {
            int total = 0;
            foreach (List<int> relatedPmids in relatedSearchResults.Values)
                total += relatedPmids.Count;
            Trace.WriteLine(DateTime.Now + " - found " + total + " related to " + relatedSearchResults.Keys.Count + " publications");
            completed = ProcessSearchResults(db, relatedTableName, publicationFilter, extremeRelevanceTableName, relatedRanks, relatedSearchResults, inputQueue);
        }

        // Both helpers return false when the run should stop (for example, the user cancelled)
        if (!completed)
            break;
    }
    ReportProgressPercent(100);
}

/// <summary>
/// Report progress to the BackgroundWorker if one has been assigned
/// </summary>
/// <param name="percent">Percentage complete; clamped to the range 0 to 100</param>
private void ReportProgressPercent(int percent)
{
    System.ComponentModel.BackgroundWorker worker = BackgroundWorker;
    if (worker != null)
        worker.ReportProgress(Math.Max(0, Math.Min(100, percent)));
}
/// <summary>
/// Create the "lite" mode output file, keeping any existing file as a .bak copy
/// </summary>
/// <param name="liteModeOutputFile">Path of the "lite" mode output file to create</param>
/// <returns>True if the file was created with its header row, false if it could not be created</returns>
private bool CreateLiteModeOutputFile(string liteModeOutputFile)
{
    try
    {
        if (File.Exists(liteModeOutputFile))
        {
            string outputFileName = Path.GetFileName(liteModeOutputFile);
            string backupFile = liteModeOutputFile + ".bak";

            // File.Move() fails if the destination exists, so remove any previous backup first
            if (File.Exists(backupFile))
            {
                Trace.WriteLine(DateTime.Now + " - deleting old \"lite\" mode output .bak file '" + outputFileName + ".bak'");
                File.Delete(backupFile);
            }
            Trace.WriteLine(DateTime.Now + " - renaming old \"lite\" mode output file '" + outputFileName + "' to '" + outputFileName + ".bak'");
            File.Move(liteModeOutputFile, backupFile);
        }

        string header = "pmid,rltd_pmid,rltd_rank,rltd_score" + Environment.NewLine;
        File.WriteAllText(liteModeOutputFile, header);
        return true;
    }
    catch (Exception ex)
    {
        Trace.WriteLine(DateTime.Now + " - unable to create the \"lite\" mode output file: " + ex.Message);
        Trace.WriteLine(ex.StackTrace);
        // Report the failure so the caller can abort instead of running without an output file
        return false;
    }
}
/// <summary>
/// Go through all of the ranks and scores retrieved from the server for each PMID and write them to the output file and the database.
/// This is used by the 'lite' mode.
/// </summary>
/// <param name="db">Database to write to</param>
/// <param name="relatedTableName">Name of the related table</param>
/// <param name="relatedSearchResults">NCBI search results parsed into a dictionary that maps queried PMIDs to a list of related PMIDs</param>
/// <param name="relatedRanks">Dictionary parsed from NCBI search results that maps each queried PMID to a dictionary of related PMIDs and their ranks and scores</param>
/// <param name="liteModeOutputFile">Output file to append to</param>
/// <param name="inputQueue">Input queue for marking success or error</param>
/// <returns>True if the lines were successfully added to the file and table, false if the operation was cancelled or an error occurred</returns>
private bool WriteRelatedRanksToOutputFileAndDatabaseForLiteMode(Database db, string relatedTableName,
    Dictionary<int, List<int>> relatedSearchResults, Dictionary<int, Dictionary<int, RankAndScore>> relatedRanks,
    string liteModeOutputFile, InputQueue inputQueue)
{
    if (BackgroundWorker != null && BackgroundWorker.CancellationPending)
    {
        Trace.WriteLine(DateTime.Now + " - cancelled");
        return false;
    }

    foreach (KeyValuePair<int, List<int>> searchResult in relatedSearchResults)
    {
        int pmid = searchResult.Key;
        List<int> relatedPmids = searchResult.Value;
        Dictionary<int, RankAndScore> ranksAndScores;

        if (relatedPmids == null)
            Trace.WriteLine(DateTime.Now + " - found empty related PMID list for PMID " + pmid);
        else if (!relatedRanks.TryGetValue(pmid, out ranksAndScores))
            Trace.WriteLine(DateTime.Now + " - no ranks or scores found for PMID " + pmid);
        else
        {
            foreach (int relatedPmid in relatedPmids)
            {
                RankAndScore rankAndScore;
                if (!ranksAndScores.TryGetValue(relatedPmid, out rankAndScore))
                {
                    Trace.WriteLine(DateTime.Now + " - unable to find related ranks and scores for PMID " + pmid + ", related PMID " + relatedPmid);
                    continue;
                }

                string line = String.Format("{0},{1},{2},{3}", pmid, relatedPmid, rankAndScore.Rank, rankAndScore.Score);
                try
                {
                    File.AppendAllText(liteModeOutputFile, line + Environment.NewLine);
                }
                catch (Exception ex)
                {
                    Trace.WriteLine(DateTime.Now + " - unable to append '" + line + "' to the \"lite\" mode output file: " + ex.Message);
                    Trace.WriteLine(ex.StackTrace);
                    Trace.WriteLine(DateTime.Now + " - cancelling the run, use the Resume button to resume");
                    inputQueue.MarkError(pmid);
                    return false;
                }

                if (!WriteRelatedRankToDatabase(db, relatedTableName, pmid, relatedPmid, rankAndScore.Rank, rankAndScore.Score))
                {
                    // Flag the PMID in the queue, matching what happens when the file append fails
                    inputQueue.MarkError(pmid);
                    return false;
                }
            }
        }

        // Mark the PMID processed in the queue (this also happens when nothing could be written for it)
        inputQueue.MarkProcessed(pmid);
    }
    return true;
}
/// <summary>
/// For each of the author's publications in the results, do a PubMed search for the linked publications
/// (constructed from the results) and add each of them to the database.
/// </summary>
/// <param name="db">Database to write to</param>
/// <param name="relatedTableName">Name of the related table</param>
/// <param name="publicationFilter">Publication filter for filtering results</param>
/// <param name="extremeRelevanceTableName">Name of the extreme relevance table in the database</param>
/// <param name="relatedRanks">Dictionary parsed from NCBI search results that maps each queried PMID to a dictionary of related PMIDs and their ranks and scores</param>
/// <param name="relatedSearchResults">NCBI search results parsed into a dictionary that maps queried PMIDs to a list of related PMIDs</param>
/// <param name="inputQueue">Input queue for marking success or error</param>
/// <returns>True if completed, false if cancelled</returns>
private bool ProcessSearchResults(Database db, string relatedTableName, PublicationFilter publicationFilter, string extremeRelevanceTableName,
    Dictionary<int, Dictionary<int, RankAndScore>> relatedRanks, Dictionary<int, List<int>> relatedSearchResults,
    InputQueue inputQueue)
{
    int count = 0;
    PublicationTypes pubTypes = new PublicationTypes(db);

    foreach (int authorPublicationPmid in relatedSearchResults.Keys)
    {
        bool error = false;

        if (BackgroundWorker != null && BackgroundWorker.CancellationPending)
        {
            Trace.WriteLine(DateTime.Now + " - cancelled");
            return false;
        }

        // Read the author publication from the database -- skipping MeSH headings and grants because we don't use them
        Publication authorPublication;
        bool retrievedPublication;
        try
        {
            retrievedPublication = Publications.GetPublication(db, authorPublicationPmid, out authorPublication, true);
        }
        catch (Exception ex)
        {
            Trace.WriteLine(DateTime.Now + " - " + ex.Message);
            retrievedPublication = false;
            authorPublication = new Publication();
        }

        if (!retrievedPublication)
        {
            Trace.WriteLine(DateTime.Now + " - unable to read publication " + authorPublicationPmid + " from the database");
            inputQueue.MarkError(authorPublicationPmid);
            continue;
        }

        // Only write this article's related publications to the database if they haven't already been added.
        // (Multiple authors might link to the same publication, and each will add the same links.)
        int relatedCount = db.GetIntValue("SELECT Count(*) FROM " + relatedTableName + " WHERE PMID = (?)",
            new System.Collections.ArrayList() { Database.Parameter(authorPublicationPmid) });
        if (relatedCount != 0)
        {
            Trace.WriteLine(DateTime.Now + " - [" + ++count + "/" + relatedSearchResults.Keys.Count + "] database already contains related articles for " + authorPublicationPmid);
        }
        else
        {
            // Get the list of related PMIDs and their ranks from the search results
            List<int> relatedPmids = relatedSearchResults[authorPublicationPmid];
            Dictionary<int, RankAndScore> relatedRank;
            if (!relatedRanks.TryGetValue(authorPublicationPmid, out relatedRank))
                relatedRank = new Dictionary<int, RankAndScore>();

            Trace.WriteLine(DateTime.Now + " - [" + ++count + "/" + relatedSearchResults.Keys.Count + "] adding " + relatedPmids.Count + " related articles found for " + authorPublicationPmid);

            string searchResults = SearchPubMedForRelatedPublications(relatedPmids);

            int publicationsWritten = 0;
            int publicationsExcluded = 0;
            int publicationsNullAuthors = 0;

            // Track the most relevant publication (i.e. the one with the highest score) so it can be added to the _extremerelevance table
            Publication? mostRelevantPublication = null;
            int mostRelevantPublicationScore = int.MinValue;

            // Track the least relevant publication (i.e. the one with the lowest score), along with its score and rank
            Publication? leastRelevantPublication = null;
            int leastRelevantPublicationScore = int.MaxValue;
            int leastRelevantPublicationRank = 0;

            // Write each publication to the database
            Publications publications = new Publications(searchResults, pubTypes);
            if (publications.PublicationList != null)
            {
                foreach (Publication relatedPublication in publications.PublicationList)
                {
                    if (BackgroundWorker != null && BackgroundWorker.CancellationPending)
                    {
                        Trace.WriteLine(DateTime.Now + " - cancelled");
                        return false;
                    }

                    int rank;
                    int score;
                    RankAndScore rankAndScore;
                    if (relatedRank.TryGetValue(relatedPublication.PMID, out rankAndScore))
                    {
                        rank = rankAndScore.Rank;
                        score = rankAndScore.Score;
                    }
                    else
                    {
                        // No rank was returned for this publication, so -1 is recorded for both values
                        rank = -1;
                        score = -1;
                        Trace.WriteLine(DateTime.Now + " - publication " + authorPublicationPmid + " could not find rank for related " + relatedPublication.PMID);
                    }

                    // A small number of publications come back with a null set of authors, which the database schema doesn't support
                    // (they are only counted and logged here; processing continues below)
                    if (relatedPublication.Authors == null)
                    {
                        publicationsNullAuthors++;
                        Trace.WriteLine(DateTime.Now + " - publication " + authorPublicationPmid + ": found related publication " + relatedPublication.PMID + " with no author list");
                    }

                    // Use the publication filter to include only publications that match the filter
                    if (publicationFilter.FilterPublication(relatedPublication, rank, authorPublication, pubTypes))
                    {
                        // Add the publication to the publications table
                        // (this will only add it if it's not already there)
                        Publications.WriteToDB(relatedPublication, db, pubTypes, null);

                        bool success = WriteRelatedRankToDatabase(db, relatedTableName, authorPublicationPmid, relatedPublication.PMID, rank, score);
                        if (success)
                            publicationsWritten++;
                        else
                            error = true;
                    }
                    else
                    {
                        publicationsExcluded++;
                    }

                    // We're keeping track of the score of the most relevant pub (even when it is filtered out).
                    if (!mostRelevantPublication.HasValue || score > mostRelevantPublicationScore)
                    {
                        mostRelevantPublication = relatedPublication;
                        mostRelevantPublicationScore = score;
                    }

                    // We're keeping track of the score of the least relevant pub too.
                    if (!leastRelevantPublication.HasValue || score < leastRelevantPublicationScore)
                    {
                        leastRelevantPublication = relatedPublication;
                        leastRelevantPublicationScore = score;
                        leastRelevantPublicationRank = rank;
                    }
                }
            }

            // Write the most and least relevant pmid/relatedPmid pairs to the _extremerelevance table (if found).
            if (mostRelevantPublication.HasValue && leastRelevantPublication.HasValue)
            {
                try
                {
                    db.ExecuteNonQuery(
                        "INSERT INTO " + extremeRelevanceTableName + " (PMID, MostRelevantPMID, MostRelevantScore, LeastRelevantPMID, LeastRelevantScore, LeastRelevantRank) VALUES (?, ?, ?, ?, ?, ?)",
                        new System.Collections.ArrayList() {
                            Database.Parameter(authorPublicationPmid),
                            Database.Parameter(mostRelevantPublication.Value.PMID),
                            Database.Parameter(mostRelevantPublicationScore),
                            Database.Parameter(leastRelevantPublication.Value.PMID),
                            Database.Parameter(leastRelevantPublicationScore),
                            Database.Parameter(leastRelevantPublicationRank)
                        });
                }
                catch (Exception ex)
                {
                    Trace.WriteLine(DateTime.Now + " - " +
                        String.Format("Error writing {0}/{1}/{2} to {3}: {4}",
                            authorPublicationPmid, mostRelevantPublication.Value.PMID, leastRelevantPublication.Value.PMID, extremeRelevanceTableName, ex.Message));
                    error = true;
                }
            }

            Trace.WriteLine(DateTime.Now + " - " +
                String.Format("Wrote {0}, excluded {1}{2}", publicationsWritten, publicationsExcluded,
                    publicationsNullAuthors == 0 ? String.Empty : ", " + publicationsNullAuthors + " had no author list"));
        }

        // Record the outcome for this author publication in the queue
        if (!error)
            inputQueue.MarkProcessed(authorPublicationPmid);
        else
            inputQueue.MarkError(authorPublicationPmid);
    }
    return true;
}
/// <summary>
/// Insert one PMID / related PMID row, with its rank and score, into the related publications table
/// </summary>
/// <param name="db">Database to write to</param>
/// <param name="relatedTableName">Name of the related table</param>
/// <param name="authorPublicationPmid">PMID that was queried</param>
/// <param name="relatedPublicationPmid">PMID of the related publication</param>
/// <param name="rank">Rank to store for the related publication</param>
/// <param name="score">Score to store for the related publication</param>
/// <returns>True if the row was inserted, false if the insert threw an exception (which is logged)</returns>
private static bool WriteRelatedRankToDatabase(Database db, string relatedTableName, int authorPublicationPmid,
    int relatedPublicationPmid, int rank, int score)
{
    bool inserted;
    try
    {
        // Build the statement and its parameters, then write the pmid/relatedPmid pair.
        string insertSql = "INSERT INTO " + relatedTableName + " (PMID, RelatedPMID, Rank, Score) VALUES (?, ?, ?, ?)";
        System.Collections.ArrayList insertParameters = new System.Collections.ArrayList();
        insertParameters.Add(Database.Parameter(authorPublicationPmid));
        insertParameters.Add(Database.Parameter(relatedPublicationPmid));
        insertParameters.Add(Database.Parameter(rank));
        insertParameters.Add(Database.Parameter(score));
        db.ExecuteNonQuery(insertSql, insertParameters);
        inserted = true;
    }
    catch (Exception failure)
    {
        Trace.WriteLine("Unable to add related article " + relatedPublicationPmid + ", error message follows");
        Trace.WriteLine(failure.Message);
        inserted = false;
    }
    return inserted;
}
/// <summary>
/// Search PubMed for all of the related publications, and keep trying until the search is successful
/// </summary>
/// <param name="relatedPmids">PMIDs of the related publications to search for</param>
/// <returns>NCBI search results</returns>
private string SearchPubMedForRelatedPublications(List<int> relatedPmids)
{
    // Build the search query to issue the PubMed search for related IDs: "1[uid] OR 2[uid] OR ..."
    StringBuilder searchQuery = new StringBuilder();
    foreach (int relatedPmid in relatedPmids)
    {
        searchQuery.AppendFormat("{0}{1}[uid]", searchQuery.Length == 0 ? String.Empty : " OR ", relatedPmid);
    }
    string query = searchQuery.ToString();

    NCBI.UsePostRequest = true;

    // If ncbi.Search() throws an exception, retry -- web connection may be temporarily down.
    // There is no retry limit: the loop only ends when a search succeeds.
    while (true)
    {
        try
        {
            return ncbi.Search(query);
        }
        catch (Exception ex)
        {
            Trace.WriteLine(DateTime.Now + " - web request error during NCBI search, retrying search. Error message: " + ex.Message);
            System.Threading.Thread.Sleep(2000);
        }
    }
}
/// <summary>
/// Create the related publications table, its queue table and (outside of "lite" mode)
/// its PeoplePublications view and extreme relevance table. Existing tables are dropped first.
/// </summary>
/// <param name="db">Database to create the tables in</param>
/// <param name="relatedTableName">Name of the table to create</param>
/// <param name="queueTableName">Name of the _queue table to create</param>
/// <param name="extremeRelevanceTableName">Name of the _extremerelevance table to create</param>
/// <param name="liteMode">In "lite" mode, only create the related publications and queue tables, not the view or the extreme relevance table</param>
private static void CreateTables(Database db, string relatedTableName, string queueTableName, string extremeRelevanceTableName, bool liteMode)
{
    // Create the related table -- for both regular and "lite" modes
    db.ExecuteNonQuery("DROP TABLE IF EXISTS " + relatedTableName);
    db.ExecuteNonQuery("CREATE TABLE " + relatedTableName + @" (
  PMID int(11) NOT NULL,
  RelatedPMID int(11) NOT NULL,
  Rank int NOT NULL,
  Score int NOT NULL,
  PRIMARY KEY (PMID, RelatedPMID)
) ENGINE=MyISAM DEFAULT CHARSET=utf8;
");

    // Create the queue -- for both regular and "lite" modes
    db.ExecuteNonQuery("DROP TABLE IF EXISTS " + queueTableName);
    db.ExecuteNonQuery("CREATE TABLE " + queueTableName + @" (
  Setnb char(8) NOT NULL,
  PMID int(11) NOT NULL,
  Processed bit(1) default 0 NOT NULL,
  Error bit(1) default 0 NOT NULL,
  PRIMARY KEY (Setnb, PMID)
) ENGINE=MyISAM DEFAULT CHARSET=utf8;
");

    if (liteMode)
        return;

    // Create the view (table name + "_peoplepublications"). It must select from the table
    // created above, so the table name is substituted rather than hard-coded.
    db.ExecuteNonQuery("CREATE OR REPLACE VIEW " + relatedTableName + @"_peoplepublications AS
  SELECT p.Setnb, rp.RelatedPMID AS PMID, -1 AS AuthorPosition, 6 AS PositionType
  FROM people p, peoplepublications pp, " + relatedTableName + @" rp
  WHERE p.Setnb = pp.Setnb
  AND pp.PMID = rp.PMID;
");

    // Create the most/least relevant publications table (table name + "_extremerelevance")
    db.ExecuteNonQuery("DROP TABLE IF EXISTS " + extremeRelevanceTableName);
    db.ExecuteNonQuery("CREATE TABLE " + extremeRelevanceTableName + @" (
  PMID int(11) NOT NULL,
  MostRelevantPMID int(11) NOT NULL,
  MostRelevantScore int NOT NULL,
  LeastRelevantPMID int(11) NOT NULL,
  LeastRelevantScore int NOT NULL,
  LeastRelevantRank int NOT NULL,
  PRIMARY KEY (PMID, MostRelevantPMID)
) ENGINE=MyISAM DEFAULT CHARSET=utf8;
");
}
/// <summary>
/// Use the NCBI Elink request to retrieve related IDs for one or more publication IDs
/// </summary>
/// <param name="ids">IDs to retrieve</param>
/// <param name="mindate">Optional minimum date</param>
/// <param name="maxdate">Optional maximum date</param>
/// <returns>A string with XML results from elink.fcgi</returns>
/// <exception cref="ArgumentNullException">Thrown when <paramref name="ids"/> is null</exception>
private static string ExecuteRelatedSearch(IEnumerable<int> ids, string mindate = null, string maxdate = null)
{
    if (ids == null)
        throw new ArgumentNullException("ids");

    // Each ID is sent as its own "id=" parameter
    StringBuilder query = new StringBuilder();
    query.AppendFormat("dbfrom={0}&db={1}&id=", ELINK_DBFROM, ELINK_DB);
    bool first = true;
    foreach (int id in ids)
    {
        if (!first)
            query.Append("&id=");
        else
            first = false;
        query.Append(id);
    }

    if (!string.IsNullOrEmpty(mindate))
        query.AppendFormat("&mindate={0}", mindate);
    if (!string.IsNullOrEmpty(maxdate))
        query.AppendFormat("&maxdate={0}", maxdate);

    // Add "&cmd=neighbor_score" to get the Score elements
    query.Append("&cmd=neighbor_score");

    // The query is sent as a form-encoded POST body rather than in the URL
    WebRequest request = WebRequest.Create(ELINK_URL);
    request.Method = "POST";
    request.ContentType = "application/x-www-form-urlencoded";
    byte[] byteArray = Encoding.UTF8.GetBytes(query.ToString());
    request.ContentLength = byteArray.Length;
    using (Stream dataStream = request.GetRequestStream())
    {
        dataStream.Write(byteArray, 0, byteArray.Length);
    }

    using (WebResponse response = request.GetResponse())
    using (Stream responseStream = response.GetResponseStream())
    using (StreamReader reader = new StreamReader(responseStream))
    {
        return reader.ReadToEnd();
    }
}
/// <summary>
/// Retrieve the IDs from the XML results from ELink
/// </summary>
/// <param name="xml">XML results from ELink</param>
/// <param name="relatedRanks">Output - dictionary that maps each source PMID to a dictionary for looking up the rank and score of its related PMIDs</param>
/// <returns>Dictionary that maps source PMIDs to the list of related IDs extracted from their "pubmed_pubmed" link set
/// (source PMIDs without that link set have no entry)</returns>
/// <exception cref="XmlException">Thrown when the XML cannot be parsed or has no eLinkResult root element</exception>
private static Dictionary<int, List<int>> GetIdsFromXml(string xml, out Dictionary<int, Dictionary<int, RankAndScore>> relatedRanks)
{
    Dictionary<int, List<int>> result = new Dictionary<int, List<int>>();
    relatedRanks = new Dictionary<int, Dictionary<int, RankAndScore>>();

    XmlDocument xmlDoc = new XmlDocument();
    xmlDoc.LoadXml(xml);
    XmlElement root = xmlDoc["eLinkResult"];
    if (root == null)
        throw new XmlException("ELink results do not contain an eLinkResult element");

    foreach (XmlNode linkSet in root.ChildNodes)
    {
        // There's one link set for each PMID from the search; skip any child that doesn't identify its PMID
        XmlElement idList = linkSet["IdList"];
        XmlElement idElement = idList == null ? null : idList["Id"];
        int pmid;
        if (idElement == null || !int.TryParse(idElement.InnerText, out pmid))
            continue;

        Dictionary<int, RankAndScore> relatedRank;
        if (!relatedRanks.TryGetValue(pmid, out relatedRank))
        {
            relatedRank = new Dictionary<int, RankAndScore>();
            relatedRanks.Add(pmid, relatedRank);
        }

        // Find the "pubmed_pubmed" link set
        foreach (XmlNode linkSetDb in linkSet.SelectNodes("LinkSetDb"))
        {
            XmlElement linkName = linkSetDb["LinkName"];
            if (linkName == null || linkName.InnerText != "pubmed_pubmed")
                continue;

            // We've found the link set of related PubMed publications. Add it to the results.
            List<int> linkList;
            if (!result.TryGetValue(pmid, out linkList))
            {
                linkList = new List<int>();
                result[pmid] = linkList;
            }

            // Ranks are assigned in document order, starting at 1, and only to links with a valid Id
            int rank = 0;
            foreach (XmlNode link in linkSetDb.SelectNodes("Link"))
            {
                // A missing or unparseable score is recorded as -1
                XmlElement scoreElement = link["Score"];
                int score;
                if (scoreElement == null || !int.TryParse(scoreElement.InnerText, out score))
                    score = -1;

                XmlElement relatedIdElement = link["Id"];
                int relatedPmid;
                if (relatedIdElement != null && int.TryParse(relatedIdElement.InnerText, out relatedPmid))
                {
                    linkList.Add(relatedPmid);
                    RankAndScore rankAndScore = new RankAndScore() { Rank = ++rank, Score = score };
                    relatedRank.Add(relatedPmid, rankAndScore);
                }
            }
        }
    }
    return result;
}
}
}