/*
 * FindRelated
 * Copyright (c) 2003-2011 Stellman & Greene Consulting
 * Developed for Joshua Zivin and Pierre Azoulay, Columbia University
 * http://www.stellman-greene.com/PublicationHarvester
 *
 * This program is free software; you can redistribute it and/or modify it under
 * the terms of the GNU General Public License as published by the Free Software
 * Foundation; either version 2 of the License, or (at your option) any later
 * version.
 *
 * This program is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
 * FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License along with
 * this program (GPL.txt); if not, write to the Free Software Foundation, Inc., 51
 * Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
 */

using System;
using System.Collections.Generic;
using System.Text;
using System.Xml;
using System.Net;
using System.IO;
using System.Data;
using System.Diagnostics;
using Com.StellmanGreene.PubMed;

namespace Com.StellmanGreene.FindRelated
{
    /// <summary>
    /// Runs the FindRelated search: for each queued set of PMIDs it asks NCBI ELink for
    /// related PubMed articles (with scores), then writes the results to the database
    /// and, in "lite" mode, to a CSV output file.
    /// </summary>
    internal class RelatedFinder
    {
        /// <summary>
        /// Rank (1-based position in the ELink result list) and score of one related publication.
        /// </summary>
        struct RankAndScore
        {
            public int Rank { get; set; }
            public int Score { get; set; }
        }

        // NOTE(review): this is a plain-HTTP endpoint; NCBI documents the E-utilities
        // base URL as https -- confirm this URL still works before relying on it.
        const string ELINK_URL = "http://www.ncbi.nlm.nih.gov/entrez/eutils/elink.fcgi";
        const string ELINK_DB = "pubmed";
        const string ELINK_DBFROM = "pubmed";

        private NCBI ncbi = new NCBI("medline");

        /// <summary>
        /// Optional worker used for progress reporting and cancellation. May be null,
        /// in which case no progress is reported and the run cannot be cancelled.
        /// </summary>
        public System.ComponentModel.BackgroundWorker BackgroundWorker { get; set; }

        /// <summary>
        /// Execute the FindRelated search, create and populate the tables
        /// </summary>
        /// <param name="odbcDsn">ODBC DSN to access the SQL server</param>
        /// <param name="relatedTableName">Name of the FindRelated SQL table to create</param>
        /// <param name="inputFileInfo">FileInfo object with information about the input CSV file</param>
        /// <param name="publicationFilter">PublicationFilter object to use for filtering publications</param>
        /// <param name="resume">True if resuming a previous run</param>
        /// <param name="liteMode">True if in "lite" mode, where it only runs the FindRelated search and does not do additional processing</param>
        /// <param name="liteModeOutputFile">Output filename for "lite" mode (ignored when not in "lite" mode)</param>
        public void Go(string odbcDsn, string relatedTableName, FileInfo inputFileInfo, PublicationFilter publicationFilter, bool resume, bool liteMode, string liteModeOutputFile)
        {
            Database db = new Database(odbcDsn);

            string queueTableName = relatedTableName + "_queue";
            string extremeRelevanceTableName = relatedTableName + "_extremerelevance";

            InputQueue inputQueue;
            if (!resume)
            {
                // Abort before touching the database if the output file can't be created
                if (liteMode && !CreateLiteModeOutputFile(liteModeOutputFile))
                    return;

                CreateTables(db, relatedTableName, queueTableName, extremeRelevanceTableName, liteMode);
                inputQueue = new InputQueue(inputFileInfo, db, queueTableName);
            }
            else
            {
                inputQueue = new InputQueue(db, queueTableName);
            }

            int setnbCount = 0;
            while (inputQueue.Next())
            {
                ReportProgress(inputQueue.Count > 0 ? (100 * setnbCount) / inputQueue.Count : 0);

                Trace.WriteLine(DateTime.Now + " - querying for related articles for setnb " + inputQueue.CurrentSetnb + " (" + ++setnbCount + " of " + inputQueue.Count + ")");

                // Do the linked publication search for the author's PMIDs and process the results.
                // This returns a Dictionary that maps author publications (from the PeoplePublications table)
                // to linked publications, so each key is one of the author publications read from the DB originally.
                string xml = ExecuteRelatedSearch(inputQueue.CurrentPmids);
                Dictionary<int, Dictionary<int, RankAndScore>> relatedRanks;
                Dictionary<int, List<int>> relatedSearchResults = GetIdsFromXml(xml, out relatedRanks);

                bool completed;
                if (liteMode)
                {
                    Trace.WriteLine(DateTime.Now + " - found " + relatedSearchResults.Count + " PMIDs for setnb " + inputQueue.CurrentSetnb);

                    completed = WriteRelatedRanksToOutputFileAndDatabaseForLiteMode(db, relatedTableName, relatedSearchResults, relatedRanks, liteModeOutputFile, inputQueue);
                }
                else
                {
                    int total = 0;
                    foreach (List<int> relatedPmids in relatedSearchResults.Values)
                        total += relatedPmids.Count;
                    Trace.WriteLine(DateTime.Now + " - found " + total + " related to " + relatedSearchResults.Keys.Count + " publications");

                    completed = ProcessSearchResults(db, relatedTableName, publicationFilter, extremeRelevanceTableName, relatedRanks, relatedSearchResults, inputQueue);
                }

                // Both writers return false when the run was cancelled or could not continue
                if (!completed)
                    break;
            }

            ReportProgress(100);
        }

        /// <summary>
        /// Report progress through the BackgroundWorker, if one has been assigned
        /// </summary>
        /// <param name="percent">Percentage complete to report</param>
        private void ReportProgress(int percent)
        {
            if (BackgroundWorker != null)
                BackgroundWorker.ReportProgress(percent);
        }

        /// <summary>
        /// Create the "lite" mode output file, backing up any existing file to ".bak"
        /// (an older ".bak" is deleted first) and writing the CSV header line
        /// </summary>
        /// <param name="liteModeOutputFile">Path of the output file to create</param>
        /// <returns>True if the file was created, false if an error occurred (the error is traced)</returns>
        private bool CreateLiteModeOutputFile(string liteModeOutputFile)
        {
            try
            {
                if (File.Exists(liteModeOutputFile))
                {
                    string outputFileName = Path.GetFileName(liteModeOutputFile);
                    string backupFile = liteModeOutputFile + ".bak";

                    if (File.Exists(backupFile))
                    {
                        Trace.WriteLine(DateTime.Now + " - deleting old \"lite\" mode output .bak file '" + outputFileName + ".bak'");
                        File.Delete(backupFile);
                    }

                    Trace.WriteLine(DateTime.Now + " - renaming old \"lite\" mode output file '" + outputFileName + "' to '" + outputFileName + ".bak'");
                    File.Move(liteModeOutputFile, backupFile);
                }

                string header = "pmid,rltd_pmid,rltd_rank,rltd_score" + Environment.NewLine;
                File.WriteAllText(liteModeOutputFile, header);
                return true;
            }
            catch (Exception ex)
            {
                Trace.WriteLine(DateTime.Now + " - unable to create the \"lite\" mode output file: " + ex.Message);
                Trace.WriteLine(ex.StackTrace);

                // Report the failure so Go() stops instead of running against a missing output file
                return false;
            }
        }

        /// <summary>
        /// Go through all of the ranks and scores retrieved from the server for each PMID and write them to the output file and the database.
        /// This is used by the 'lite' mode.
        /// </summary>
        /// <param name="db">Database to write to</param>
        /// <param name="relatedTableName">Name of the related table</param>
        /// <param name="relatedSearchResults">NCBI search results parsed into a dictionary that maps queried PMIDs to a list of related PMIDs</param>
        /// <param name="relatedRanks">Dictionary parsed from NCBI search results that maps each queried PMID to a dictionary of related PMIDs and their ranks and scores</param>
        /// <param name="liteModeOutputFile">Output file to append to</param>
        /// <param name="inputQueue">Input queue for marking success or error</param>
        /// <returns>True if the lines were successfully added to the file and table, false if the run was cancelled or an error occurred</returns>
        private bool WriteRelatedRanksToOutputFileAndDatabaseForLiteMode(Database db, string relatedTableName, Dictionary<int, List<int>> relatedSearchResults, Dictionary<int, Dictionary<int, RankAndScore>> relatedRanks, string liteModeOutputFile, InputQueue inputQueue)
        {
            if (BackgroundWorker != null && BackgroundWorker.CancellationPending)
            {
                Trace.WriteLine(DateTime.Now + " - cancelled");
                return false;
            }

            foreach (int pmid in relatedSearchResults.Keys)
            {
                List<int> relatedPmids = relatedSearchResults[pmid];
                Dictionary<int, RankAndScore> ranksAndScores;

                if (relatedPmids == null)
                {
                    Trace.WriteLine(DateTime.Now + " - found empty related PMID list for PMID " + pmid);
                }
                else if (!relatedRanks.TryGetValue(pmid, out ranksAndScores))
                {
                    Trace.WriteLine(DateTime.Now + " - no ranks or scores found for PMID " + pmid);
                }
                else
                {
                    foreach (int relatedPmid in relatedPmids)
                    {
                        RankAndScore rankAndScore;
                        if (!ranksAndScores.TryGetValue(relatedPmid, out rankAndScore))
                        {
                            Trace.WriteLine(DateTime.Now + " - unable to find related ranks and scores for PMID " + pmid + ", related PMID " + relatedPmid);
                            continue;
                        }

                        string line = String.Format("{0},{1},{2},{3}", pmid, relatedPmid, rankAndScore.Rank, rankAndScore.Score);
                        string output = line + Environment.NewLine;
                        try
                        {
                            File.AppendAllText(liteModeOutputFile, output);
                        }
                        catch (Exception ex)
                        {
                            Trace.WriteLine(DateTime.Now + " - unable to append '" + line + "' to the \"lite\" mode output file: " + ex.Message);
                            Trace.WriteLine(ex.StackTrace);
                            Trace.WriteLine(DateTime.Now + " - cancelling the run, use the Resume button to resume");
                            inputQueue.MarkError(pmid);
                            return false;
                        }

                        // WriteRelatedRankToDatabase() traces its own error message on failure
                        bool written = WriteRelatedRankToDatabase(db, relatedTableName, pmid, relatedPmid, rankAndScore.Rank, rankAndScore.Score);
                        if (!written)
                            return false;
                    }
                }

                // Mark the PMID processed in the queue
                inputQueue.MarkProcessed(pmid);
            }

            return true;
        }

        /// <summary>
        /// For each of the author's publications in the results, do a PubMed search for the linked publications
        /// (constructed from the results) and add each of them to the database.
        /// </summary>
        /// <param name="db">Database to write to</param>
        /// <param name="relatedTableName">Name of the related table</param>
        /// <param name="publicationFilter">Publication filter for filtering results</param>
        /// <param name="extremeRelevanceTableName">Name of the extreme relevance table in the database</param>
        /// <param name="relatedRanks">Dictionary parsed from NCBI search results that maps each queried PMID to a dictionary of related PMIDs and their ranks and scores</param>
        /// <param name="relatedSearchResults">NCBI search results parsed into a dictionary that maps queried PMIDs to a list of related PMIDs</param>
        /// <param name="inputQueue">Input queue for marking success or error</param>
        /// <returns>True if completed, false if cancelled</returns>
        private bool ProcessSearchResults(Database db, string relatedTableName, PublicationFilter publicationFilter, string extremeRelevanceTableName, Dictionary<int, Dictionary<int, RankAndScore>> relatedRanks, Dictionary<int, List<int>> relatedSearchResults, InputQueue inputQueue)
        {
            int count = 0;
            PublicationTypes pubTypes = new PublicationTypes(db);

            foreach (int authorPublicationPmid in relatedSearchResults.Keys)
            {
                bool error = false;

                if (BackgroundWorker != null && BackgroundWorker.CancellationPending)
                {
                    Trace.WriteLine(DateTime.Now + " - cancelled");
                    return false;
                }

                // Read the author publication from the database -- skipping MeSH headings and grants because we don't use them
                Publication authorPublication;
                bool retrievedPublication;
                try
                {
                    retrievedPublication = Publications.GetPublication(db, authorPublicationPmid, out authorPublication, true);
                }
                catch (Exception ex)
                {
                    Trace.WriteLine(DateTime.Now + " - " + ex.Message);
                    retrievedPublication = false;
                    authorPublication = new Publication();
                }

                if (!retrievedPublication)
                {
                    Trace.WriteLine(DateTime.Now + " - unable to read publication " + authorPublicationPmid + " from the database");
                    inputQueue.MarkError(authorPublicationPmid);
                    continue;
                }

                // Only write this article's related publications to the database if they haven't already been added.
                // (Multiple authors might link to the same publication, and each will add the same links.)
                int relatedCount = db.GetIntValue("SELECT Count(*) FROM " + relatedTableName + " WHERE PMID = (?)",
                    new System.Collections.ArrayList() { Database.Parameter(authorPublicationPmid) });
                if (relatedCount != 0)
                {
                    Trace.WriteLine(DateTime.Now + " - [" + ++count + "/" + relatedSearchResults.Keys.Count + "] database already contains related articles for " + authorPublicationPmid);
                }
                else
                {
                    // Get the list of related PMIDs and their ranks from the search results
                    List<int> relatedPmids = relatedSearchResults[authorPublicationPmid];
                    Dictionary<int, RankAndScore> relatedRank;
                    if (!relatedRanks.TryGetValue(authorPublicationPmid, out relatedRank))
                        relatedRank = new Dictionary<int, RankAndScore>();

                    Trace.WriteLine(DateTime.Now + " - [" + ++count + "/" + relatedSearchResults.Keys.Count + "] adding " + relatedPmids.Count + " related articles found for " + authorPublicationPmid);

                    string searchResults = SearchPubMedForRelatedPublications(relatedPmids);

                    int publicationsWritten = 0;
                    int publicationsExcluded = 0;
                    int publicationsNullAuthors = 0;

                    // Track the most relevant publication (eg. the one with the highest score) so it can be added to relatedpublications_extremerelevance
                    Publication? mostRelevantPublication = null;
                    int mostRelevantPublicationScore = int.MinValue;

                    // Track the least relevant publication (eg. the one with the lowest score), also written to the _extremerelevance table
                    Publication? leastRelevantPublication = null;
                    int leastRelevantPublicationScore = int.MaxValue;
                    int leastRelevantPublicationRank = 0;

                    // Write each publication to the database
                    Publications publications = new Publications(searchResults, pubTypes);
                    if (publications.PublicationList != null)
                    {
                        foreach (Publication relatedPublication in publications.PublicationList)
                        {
                            if (BackgroundWorker != null && BackgroundWorker.CancellationPending)
                            {
                                Trace.WriteLine(DateTime.Now + " - cancelled");
                                return false;
                            }

                            // Rank and score fall back to -1 when ELink returned none for this publication
                            int rank;
                            int score;
                            RankAndScore rankAndScore;
                            if (relatedRank.TryGetValue(relatedPublication.PMID, out rankAndScore))
                            {
                                rank = rankAndScore.Rank;
                                score = rankAndScore.Score;
                            }
                            else
                            {
                                rank = -1;
                                score = -1;
                                Trace.WriteLine(DateTime.Now + " - publication " + authorPublicationPmid + " could not find rank for related " + relatedPublication.PMID);
                            }

                            // A small number of publications come back with a null set of authors, which the database schema doesn't support
                            if (relatedPublication.Authors == null)
                            {
                                publicationsNullAuthors++;
                                Trace.WriteLine(DateTime.Now + " - publication " + authorPublicationPmid + ": found related publication " + relatedPublication.PMID + " with no author list");
                            }

                            // Use the publication filter to include only publications that match the filter
                            if (publicationFilter.FilterPublication(relatedPublication, rank, authorPublication, pubTypes))
                            {
                                // Add the publication to the publications table
                                // (this will only add it if it's not already there)
                                Publications.WriteToDB(relatedPublication, db, pubTypes, null);

                                bool success = WriteRelatedRankToDatabase(db, relatedTableName, authorPublicationPmid, relatedPublication.PMID, rank, score);
                                if (success)
                                    publicationsWritten++;
                                else
                                    error = true;
                            }
                            else
                            {
                                publicationsExcluded++;
                            }

                            // We're keeping track of the score of the most relevant pub (even when it is filtered out).
                            if (!mostRelevantPublication.HasValue || score > mostRelevantPublicationScore)
                            {
                                mostRelevantPublication = relatedPublication;
                                mostRelevantPublicationScore = score;
                            }

                            // We're keeping track of the score of the least relevant pub too.
                            if (!leastRelevantPublication.HasValue || score < leastRelevantPublicationScore)
                            {
                                leastRelevantPublication = relatedPublication;
                                leastRelevantPublicationScore = score;
                                leastRelevantPublicationRank = rank;
                            }
                        }
                    }

                    // Write the most and least relevant pmid/relatedPmid pairs to the _extremerelevance table (if found).
                    if (mostRelevantPublication.HasValue && leastRelevantPublication.HasValue)
                    {
                        try
                        {
                            db.ExecuteNonQuery(
                                "INSERT INTO " + extremeRelevanceTableName + " (PMID, MostRelevantPMID, MostRelevantScore, LeastRelevantPMID, LeastRelevantScore, LeastRelevantRank) VALUES (?, ?, ?, ?, ?, ?)",
                                new System.Collections.ArrayList() {
                                    Database.Parameter(authorPublicationPmid),
                                    Database.Parameter(mostRelevantPublication.Value.PMID),
                                    Database.Parameter(mostRelevantPublicationScore),
                                    Database.Parameter(leastRelevantPublication.Value.PMID),
                                    Database.Parameter(leastRelevantPublicationScore),
                                    Database.Parameter(leastRelevantPublicationRank)
                                });
                        }
                        catch (Exception ex)
                        {
                            Trace.WriteLine(DateTime.Now + " - "
                                + String.Format("Error writing {0}/{1}/{2} to {3}: {4}",
                                    authorPublicationPmid, mostRelevantPublication.Value.PMID, leastRelevantPublication.Value.PMID, extremeRelevanceTableName, ex.Message));
                            error = true;
                        }
                    }

                    Trace.WriteLine(DateTime.Now + " - "
                        + String.Format("Wrote {0}, excluded {1}{2}",
                            publicationsWritten, publicationsExcluded,
                            publicationsNullAuthors == 0 ? String.Empty : ", " + publicationsNullAuthors + " had no author list"));
                }

                if (!error)
                    inputQueue.MarkProcessed(authorPublicationPmid);
                else
                    inputQueue.MarkError(authorPublicationPmid);
            }

            return true;
        }

        /// <summary>
        /// Write one pmid/relatedPmid pair with its rank and score to the related publications table
        /// </summary>
        /// <param name="db">Database to write to</param>
        /// <param name="relatedTableName">Name of the related table</param>
        /// <param name="authorPublicationPmid">PMID of the queried publication</param>
        /// <param name="relatedPublicationPmid">PMID of the related publication</param>
        /// <param name="rank">Rank of the related publication</param>
        /// <param name="score">Score of the related publication</param>
        /// <returns>True if the row was inserted, false if the insert threw (the error is traced)</returns>
        private static bool WriteRelatedRankToDatabase(Database db, string relatedTableName, int authorPublicationPmid, int relatedPublicationPmid, int rank, int score)
        {
            try
            {
                // Write the pmid/relatedPmid pair to the related publications table.
                db.ExecuteNonQuery(
                    "INSERT INTO " + relatedTableName + " (PMID, RelatedPMID, Rank, Score) VALUES (?, ?, ?, ?)",
                    new System.Collections.ArrayList() {
                        Database.Parameter(authorPublicationPmid),
                        Database.Parameter(relatedPublicationPmid),
                        Database.Parameter(rank),
                        Database.Parameter(score),
                    });
                return true;
            }
            catch (Exception ex)
            {
                Trace.WriteLine("Unable to add related article " + relatedPublicationPmid + ", error message follows");
                Trace.WriteLine(ex.Message);
                return false;
            }
        }

        /// <summary>
        /// Search PubMed for all of the related publications, keep trying until search is successful
        /// </summary>
        /// <param name="relatedPmids">Related publications</param>
        /// <returns>NCBI search results</returns>
        private string SearchPubMedForRelatedPublications(List<int> relatedPmids)
        {
            // Build the search query to issue the PubMed search for related IDs
            StringBuilder searchQuery = new StringBuilder();
            foreach (int relatedPmid in relatedPmids)
            {
                searchQuery.AppendFormat("{0}{1}[uid]", searchQuery.Length == 0 ? String.Empty : " OR ", relatedPmid);
            }

            NCBI.UsePostRequest = true;

            // If ncbi.Search() throws an exception, retry -- web connection may be temporarily down.
            // NOTE(review): the retry is unbounded and does not check for cancellation.
            string query = searchQuery.ToString();
            while (true)
            {
                try
                {
                    return ncbi.Search(query);
                }
                catch (Exception ex)
                {
                    Trace.WriteLine(DateTime.Now + " - web request error during NCBI search, retrying search. Error message: " + ex.Message);
                    System.Threading.Thread.Sleep(2000);
                }
            }
        }

        /// <summary>
        /// Create the related publications table, the queue table and (outside "lite" mode)
        /// the PeoplePublications view and the extreme relevance table. Existing tables are dropped first.
        /// </summary>
        /// <param name="db">Database to create the tables in</param>
        /// <param name="relatedTableName">Name of the table to create</param>
        /// <param name="queueTableName">Name of the _queue table created</param>
        /// <param name="extremeRelevanceTableName">Name of the _extremerelevance table created</param>
        /// <param name="liteMode">In "lite" mode, only create the related publications and queue tables, not the others</param>
        private static void CreateTables(Database db, string relatedTableName, string queueTableName, string extremeRelevanceTableName, bool liteMode)
        {
            // Create the related table -- for both regular and "lite" modes
            db.ExecuteNonQuery("DROP TABLE IF EXISTS " + relatedTableName);
            db.ExecuteNonQuery("CREATE TABLE " + relatedTableName + " ( "
                + "PMID int(11) NOT NULL, "
                + "RelatedPMID int(11) NOT NULL, "
                + "Rank int NOT NULL, "
                + "Score int NOT NULL, "
                + "PRIMARY KEY (PMID, RelatedPMID) "
                + ") ENGINE=MyISAM DEFAULT CHARSET=utf8; ");

            // Create the queue -- for both regular and "lite" modes
            db.ExecuteNonQuery("DROP TABLE IF EXISTS " + queueTableName);
            db.ExecuteNonQuery("CREATE TABLE " + queueTableName + " ( "
                + "Setnb char(8) NOT NULL, "
                + "PMID int(11) NOT NULL, "
                + "Processed bit(1) default 0 NOT NULL, "
                + "Error bit(1) default 0 NOT NULL, "
                + "PRIMARY KEY (Setnb, PMID) "
                + ") ENGINE=MyISAM DEFAULT CHARSET=utf8; ");

            if (liteMode)
                return;

            // Create the view (table name + "_peoplepublications")
            // NOTE(review): the view reads from the hard-coded "relatedpublications" table rather than
            // relatedTableName -- confirm whether that is intended when a different table name is used.
            db.ExecuteNonQuery("CREATE OR REPLACE VIEW " + relatedTableName + "_peoplepublications AS "
                + "SELECT p.Setnb, rp.RelatedPMID AS PMID, -1 AS AuthorPosition, 6 AS PositionType "
                + "FROM people p, peoplepublications pp, relatedpublications rp "
                + "WHERE p.Setnb = pp.Setnb "
                + "AND pp.PMID = rp.PMID; ");

            // Create the most/least relevant publications table (table name + "_extremerelevance")
            db.ExecuteNonQuery("DROP TABLE IF EXISTS " + extremeRelevanceTableName);
            db.ExecuteNonQuery("CREATE TABLE " + extremeRelevanceTableName + " ( "
                + "PMID int(11) NOT NULL, "
                + "MostRelevantPMID int(11) NOT NULL, "
                + "MostRelevantScore int NOT NULL, "
                + "LeastRelevantPMID int(11) NOT NULL, "
                + "LeastRelevantScore int NOT NULL, "
                + "LeastRelevantRank int NOT NULL, "
                + "PRIMARY KEY (PMID, MostRelevantPMID) "
                + ") ENGINE=MyISAM DEFAULT CHARSET=utf8; ");
        }

        /// <summary>
        /// Use the NCBI Elink request to retrieve related IDs for one or more publication IDs
        /// </summary>
        /// <param name="ids">IDs to retrieve</param>
        /// <param name="mindate">Optional minimum date</param>
        /// <param name="maxdate">Optional maximum date</param>
        /// <returns>A string with XML results from elink.fcgi</returns>
        /// <exception cref="ArgumentNullException">Thrown when ids is null</exception>
        private static string ExecuteRelatedSearch(IEnumerable<int> ids, string mindate = null, string maxdate = null)
        {
            if (ids == null)
                throw new ArgumentNullException("ids");

            // Build the form body: each ID gets its own "id=" parameter
            StringBuilder query = new StringBuilder();
            query.AppendFormat("dbfrom={0}&db={1}&id=", ELINK_DBFROM, ELINK_DB);
            bool first = true;
            foreach (int id in ids)
            {
                if (!first)
                    query.Append("&id=");
                else
                    first = false;
                query.Append(id);
            }

            if (!string.IsNullOrEmpty(mindate))
                query.AppendFormat("&mindate={0}", mindate);
            if (!string.IsNullOrEmpty(maxdate))
                query.AppendFormat("&maxdate={0}", maxdate);

            // Add "&cmd=neighbor_score" to get the <Score> elements
            query.Append("&cmd=neighbor_score");

            WebRequest request = WebRequest.Create(ELINK_URL);
            request.Method = "POST";
            request.ContentType = "application/x-www-form-urlencoded";
            byte[] byteArray = Encoding.UTF8.GetBytes(query.ToString());
            request.ContentLength = byteArray.Length;

            using (Stream dataStream = request.GetRequestStream())
            {
                dataStream.Write(byteArray, 0, byteArray.Length);
            }

            using (WebResponse response = request.GetResponse())
            using (Stream responseStream = response.GetResponseStream())
            using (StreamReader reader = new StreamReader(responseStream))
            {
                return reader.ReadToEnd();
            }
        }

        /// <summary>
        /// Retrieve the IDs from the XML results from ELink
        /// </summary>
        /// <param name="xml">XML results from ELink</param>
        /// <param name="relatedRanks">Output - dictionary that maps PMIDs to map for looking up rank and score in related results</param>
        /// <returns>Dictionary that maps source PMIDs to a list of IDs extracted from the XML (or an empty list if none)</returns>
        /// <exception cref="XmlException">Thrown when the XML is malformed or has no eLinkResult element</exception>
        private static Dictionary<int, List<int>> GetIdsFromXml(string xml, out Dictionary<int, Dictionary<int, RankAndScore>> relatedRanks)
        {
            Dictionary<int, List<int>> result = new Dictionary<int, List<int>>();
            relatedRanks = new Dictionary<int, Dictionary<int, RankAndScore>>();

            XmlDocument xmlDoc = new XmlDocument();
            xmlDoc.LoadXml(xml);

            XmlElement root = xmlDoc["eLinkResult"];
            if (root == null)
                throw new XmlException("ELink results do not contain an eLinkResult element");

            foreach (XmlNode linkSet in root.ChildNodes)
            {
                // There's one <LinkSet> for each PMID from the search; skip anything without a source ID
                XmlElement idList = linkSet["IdList"];
                XmlElement sourceId = (idList == null) ? null : idList["Id"];
                if (sourceId == null)
                {
                    if (linkSet.NodeType == XmlNodeType.Element)
                        Trace.WriteLine(DateTime.Now + " - skipping ELink result element '" + linkSet.Name + "' with no source ID");
                    continue;
                }

                int pmid;
                if (!int.TryParse(sourceId.InnerText, out pmid))
                    continue;

                Dictionary<int, RankAndScore> relatedRank;
                if (!relatedRanks.TryGetValue(pmid, out relatedRank))
                {
                    relatedRank = new Dictionary<int, RankAndScore>();
                    relatedRanks.Add(pmid, relatedRank);
                }

                // Find the "pubmed_pubmed" link set
                foreach (XmlNode linkSetDb in linkSet.SelectNodes("LinkSetDb"))
                {
                    XmlElement linkName = linkSetDb["LinkName"];
                    if (linkName == null || linkName.InnerText != "pubmed_pubmed")
                        continue;

                    // We've found the link set of related PubMed publications. Add it to the results.
                    List<int> linkList;
                    if (!result.TryGetValue(pmid, out linkList))
                    {
                        linkList = new List<int>();
                        result[pmid] = linkList;
                    }

                    int rank = 0;
                    foreach (XmlNode link in linkSetDb.SelectNodes("Link"))
                    {
                        // A missing or unparseable score is recorded as -1
                        int score;
                        XmlElement scoreElement = link["Score"];
                        if (scoreElement == null || !int.TryParse(scoreElement.InnerText, out score))
                            score = -1;

                        int relatedPmid;
                        XmlElement relatedId = link["Id"];
                        if (relatedId == null || !int.TryParse(relatedId.InnerText, out relatedPmid))
                            continue;

                        // Keep only the first occurrence of a related PMID; adding it twice would throw
                        if (relatedRank.ContainsKey(relatedPmid))
                            continue;

                        linkList.Add(relatedPmid);
                        relatedRank.Add(relatedPmid, new RankAndScore() { Rank = ++rank, Score = score });
                    }
                }
            }

            return result;
        }
    }
}