/*
* Publication Harvester
* Copyright (c) 2003-2006 Stellman & Greene Consulting
* Developed for Joshua Zivin and Pierre Azoulay, Columbia University
* http://www.stellman-greene.com/PublicationHarvester
*
* This program is free software; you can redistribute it and/or modify it under
* the terms of the GNU General Public License as published by the Free Software
* Foundation; either version 2 of the License, or (at your option) any later
* version.
*
* This program is distributed in the hope that it will be useful, but WITHOUT
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
* FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License along with
* this program (GPL.txt); if not, write to the Free Software Foundation, Inc., 51
* Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
*/
using System;
using System.Collections.Generic;
using System.Text;
using System.Collections;
using System.Data;
using System.IO;
namespace Com.StellmanGreene.PubMed
{
public class Reports
{
///
/// Table that contains the people to read (usually "People")
///
public string PeopleTable = "People";
///
/// Table contains the publications for the people
///
public string PeoplePublicationsTable = "PeoplePublications";
///
/// The contents of the journal weights file
///
public Hashtable Weights;
///
/// The PublicationTypes object passed in the constructor
///
public PublicationTypes PubTypes;
///
/// The Database object passed in the constructor
///
public Database DB;
///
/// Initialize the journal weights file
///
/// Filename of the journal weights file
public Reports(Database DB, string JournalWeightsFilename)
{
PeopleReportSections = DefaultPeopleReportSections();
string[] Columns = { "JOURNAL TITLE", "JIF" };
DataTable Results = People.ReadExcelFile(
Path.GetDirectoryName(JournalWeightsFilename),
Path.GetFileName(JournalWeightsFilename),
Columns);
// Populate the Weights hash table (which was declared to be case-insensitive)
Weights = new Hashtable(StringComparer.CurrentCultureIgnoreCase);
foreach (DataRow Row in Results.Rows)
{
if (!Weights.ContainsKey(Row["JOURNAL TITLE"].ToString()))
Weights.Add(Row["JOURNAL TITLE"].ToString(), Convert.ToSingle(Row["JIF"]));
}
this.PubTypes = new PublicationTypes(DB);
this.DB = DB;
}
///
/// Class to contain the counts for an author
///
private class Counts
{
public int First;
public float FirstWeighted;
public int Last;
public float LastWeighted;
public int Middle;
public float MiddleWeighted;
public int NextToLast;
public float NextToLastWeighted;
public int Second;
public float SecondWeighted;
///
/// Retrieve counts from a publication list
///
/// Publication list to retrieve counts from,
/// sorted by year, publication type and author position
/// Offset in the publication list of the first publication
/// matching the year and publication type
/// Year to match for
/// Publication type to match for
public Counts(Publication[] PublicationList, ref int Index, int Year, int PublicationType,
PublicationTypes PubTypes, Database DB, Person person, Hashtable Weights, string PeoplePublicationsTable)
{
// Return zero counts if the publication list is empty
if (PublicationList.Length == 0)
return;
// Return zero counts if the index is out of bounds
if ((Index < 0) || (Index >= PublicationList.Length))
return;
// Return zero counts if the index doesn't point to a match -- that means
// there are no matches
Publication pub = PublicationList[Index];
int PubType = PubTypes.GetCategoryNumber(pub.PubType);
if ((pub.Year != Year) || (PubType != PublicationType))
return;
// If we get this far, we have a match. Move forward through the publication
// list, adding to the counts, until we find a non-matching publication or
// the list runs out.
do
{
// Get the weight for the journal
float Weight = 0;
if (pub.Journal != null && Weights.ContainsKey(pub.Journal))
{
Weight += (float)Weights[pub.Journal];
}
// Get the position type, and increment the correct counter
Harvester.AuthorPositions PositionType;
Publications.GetAuthorPosition(DB, pub.PMID, person, out PositionType, PeoplePublicationsTable);
switch (PositionType)
{
case Harvester.AuthorPositions.First:
First++;
FirstWeighted += Weight;
break;
case Harvester.AuthorPositions.Last:
Last++;
LastWeighted += Weight;
break;
case Harvester.AuthorPositions.Second:
Second++;
SecondWeighted += Weight;
break;
case Harvester.AuthorPositions.NextToLast:
NextToLast++;
NextToLastWeighted += Weight;
break;
case Harvester.AuthorPositions.Middle:
case Harvester.AuthorPositions.None:
Middle++;
MiddleWeighted += Weight;
break;
}
Index++;
if (Index < PublicationList.Length)
{
pub = PublicationList[Index];
PubType = PubTypes.GetCategoryNumber(pub.PubType);
}
} while ((Index < PublicationList.Length)
&& (PublicationList[Index].Year == Year)
&& (PubType == PublicationType));
}
///
/// Create a Counts() object that's a sum of other counts
///
/// Array of Counts() objects to sum
public Counts(Counts[] CountsToSum)
{
foreach (Counts counts in CountsToSum)
{
First += counts.First;
FirstWeighted += counts.FirstWeighted;
Last += counts.Last;
LastWeighted += counts.LastWeighted;
Middle += counts.Middle;
MiddleWeighted += counts.MiddleWeighted;
NextToLast += counts.NextToLast;
NextToLastWeighted += counts.NextToLastWeighted;
Second += counts.Second;
SecondWeighted += counts.SecondWeighted;
}
}
///
/// Write the six counts to the report row
///
/// Writer to write the counts to
public void WriteCounts(StringWriter writer)
{
// pubcount -- Total nb. of pubs
int Total = First + Last + Second + NextToLast + Middle;
writer.Write("," + Total.ToString());
// wghtd_pubcount -- Weighted total nb. of pubs
float TotalWeighted = FirstWeighted + LastWeighted + SecondWeighted +
NextToLastWeighted + MiddleWeighted;
writer.Write("," + TotalWeighted.ToString());
// pubcount_pos1 -- Total nb. of pubs, 1st author
writer.Write("," + First);
// wghtd_pubcount_pos1 -- Weighted total nb. of pubs, 1st author
writer.Write("," + FirstWeighted);
// pubcount_posN -- Total nb. of pubs, last author
writer.Write("," + Last);
// wghtd_pubcount_posN -- Weighted total nb. of pubs, last author
writer.Write("," + LastWeighted);
// pubcount_posM -- Total nb. of pubs, middle author
writer.Write("," + Middle);
// wghtd_pubcount_posM -- Weighted total nb. of pubs, middle author
writer.Write("," + MiddleWeighted);
// pubcount_posNTL -- Total nb. of pubs, next-to-last author
writer.Write("," + NextToLast);
// wghtd_pubcount_posNTL -- Weighted total nb. of pubs, next-to-last author
writer.Write("," + NextToLastWeighted);
// pubcount_pos2 -- Total nb. of pubs, 2nd author
writer.Write("," + Second);
// wghtd_pubcount_pos2 -- Weighted total nb. of pubs, 2nd author
writer.Write("," + SecondWeighted);
}
}
///
/// Create a row in the People report
///
/// Person to write
/// Publications to use as input
/// Year to write
/// The row in CSV format
public string ReportRow(Person person, Publications Pubs, int Year)
{
// This function has been optimized so that the software only loops through the list of publications
// once. To do this, the list is first sorted in order of year, publication type "bin", author
// position type and PMID. (PMID is only there so that the ordering of the list is easily predictable.)
//
// The function builds one row in the report by constructing an array of values, and then joining
// that array using commas. (There are no strings with commas, so this will be a valid CSV row.)
// The row is divided into sections: a set of columns for each bin, one column per author position.
StringBuilder sb = new StringBuilder();
StringWriter writer = new StringWriter(sb);
// Write the keys
// setnb -- Person identifier
// year -- Year of transition
writer.Write(person.Setnb + ",");
writer.Write(Year.ToString());
// The array has been sorted, so we an search for the year. Note that the
// binary search may not return the first matching index, so we need to rewind.
PublicationYearFinder YearFinder = new PublicationYearFinder();
int Index = Array.BinarySearch(Pubs.PublicationList, Year, YearFinder);
while ((Index > 0) && (Pubs.PublicationList[Index - 1].Year == Year))
Index--;
// Get the counts for each publication type "bin"
// The bins are defined in the PeopleReportSections array, which
// contains either "all", "i+j+k+..+y+z" or "n"
// Query the PublicationTypes table to find all of the pub types,
// and use them to build a Hashtable, indexed by publication type
// category, that contains a Counts() object for that type
Hashtable CategoryCounts = new Hashtable();
DataTable CategoryTable = DB.ExecuteQuery(
@"SELECT DISTINCT PubTypeCategoryID FROM PubTypeCategories
ORDER BY PubTypeCategoryID;"); // Order by Category ID so it doesn't break the optimization
int NumCategories = CategoryTable.Rows.Count;
int[] Categories = new int[NumCategories];
for (int RowNum = 0; RowNum < CategoryTable.Rows.Count; RowNum++)
{
int Category = Convert.ToInt32(CategoryTable.Rows[RowNum]["PubTypeCategoryID"]);
Categories[RowNum] = Category;
CategoryCounts[Category] = new Counts(
Pubs.PublicationList, ref Index, Year, Category,
PubTypes, DB, person, Weights, PeoplePublicationsTable);
}
// For each section in PeopleReportSections, write the appropriate section,
// using the Counts() object that was just calculated and stuck into
// the CategoryCounts hashtable
for (int SectionNum = 0; SectionNum < PeopleReportSections.Length; SectionNum++)
{
string Section = PeopleReportSections[SectionNum];
if (Section == "all")
{
// The section is "all" -- generate a count of all values
Counts[] AllCountObjects = new Counts[NumCategories];
for (int i = 0; i < NumCategories; i++) {
AllCountObjects[i] = (Counts)CategoryCounts[Categories[i]];
}
Counts AllCounts = new Counts(AllCountObjects);
AllCounts.WriteCounts(writer);
}
else if (Section.Contains("+"))
{
// The section contains a list of categories separated with +'s
// This is a sum of categories (like "1+2+3")
string[] SectionSplit = Section.Split('+');
Counts[] SumCountObjects = new Counts[SectionSplit.Length];
for (int i = 0; i < SectionSplit.Length; i++)
{
string OneSection = SectionSplit[i];
if (!Publications.IsNumeric(OneSection))
throw new Exception("ReportSections contains invalid section '" + Section + "'");
int SectionValue = Convert.ToInt32(OneSection);
if (CategoryCounts.ContainsKey(SectionValue))
{
Counts OneBinCounts = (Counts) CategoryCounts[SectionValue];
SumCountObjects[i] = OneBinCounts;
}
else
{
throw new Exception("ReportSections contains invalid section '" + Section + "'");
}
}
Counts SumCounts = new Counts(SumCountObjects);
SumCounts.WriteCounts(writer);
}
else
{
// The section contains a single bin -- generate a Counts object
// and write it out. (Make sure it's a real category!)
if (!Publications.IsNumeric(Section))
throw new Exception("ReportSections contains invalid section '" + Section + "'");
int SectionValue = Convert.ToInt32(Section);
if (CategoryCounts.ContainsKey(SectionValue))
{
Counts SingleBinCounts = (Counts)CategoryCounts[Categories[SectionValue]];
SingleBinCounts.WriteCounts(writer);
}
else
{
throw new Exception("ReportSections contains invalid section '" + Section + "'");
}
}
}
return sb.ToString();
}
///
/// Callback function for the reports to return status
///
/// Number of publications processed
/// Total publications for this person
public delegate void ReportStatus(int number, int total, Person person, bool ProgressBarOnly);
///
/// Callback function for reports to send a message
///
/// Message to send
public delegate void ReportMessage(string Message);
///
/// This array defines the columns that are generated as part of the People report
/// Each array element either contains "all" (for all bins), "i+j+k+...+y+z" (for
/// a sum of many bins), or "n" (for bin #n)
///
public string[] PeopleReportSections;
///
/// This static function returns the default report sections, so the GUI
/// can populate its list properly
///
/// An array of the default People report sections
public static string[] DefaultPeopleReportSections() {
string[] returnValue = { "all", "1+2+3", "1", "2", "3", "4" };
return returnValue;
}
///
/// Add rows to the People report
///
/// Writer to write the CSV rows to
public void PeopleReport(ArrayList SetnbsToSkip, StreamWriter writer, ReportStatus StatusCallback, ReportMessage MessageCallback)
{
// Write the header row -- this must be generated dynamically
// based on the values in PeopleReportSections
// write the keys
writer.Write("setnb,year");
// write a set of column names for each element in PeopleReportSections
for (int i = 0; i < PeopleReportSections.Length; i++)
{
string values = PeopleReportSections[i].ToLower().Trim();
string[] BaseColumnNames = {
"pubcount", "wghtd_pubcount", "pubcount_pos1",
"wghtd_pubcount_pos1", "pubcount_posN", "wghtd_pubcount_posN",
"pubcount_posM", "wghtd_pubcount_posM", "pubcount_posNTL",
"wghtd_pubcount_posNTL", "pubcount_pos2", "wghtd_pubcount_pos2"
};
if (values == "all")
{
// all bins -- use the base column names as-is
writer.Write("," + String.Join(",", BaseColumnNames));
} else
{
// string any +'s from the value type, so "1+2+3" turns into "123"
values = values.Replace("+", "");
// replace pubcount_posM with 123pubcount_posM
// replace wghtd_pubcount_pos1 with wghtd_123pubcount_pos1
for (int j = 0; j < BaseColumnNames.Length; j++)
{
string Column;
if (BaseColumnNames[j].Contains("wghtd_"))
Column = BaseColumnNames[j].Replace("wghtd_", "wghtd_" + values);
else
Column = values + BaseColumnNames[j];
writer.Write("," + Column);
}
}
}
writer.WriteLine();
// Write the row for each person
People people = new People(DB, PeopleTable);
int Total = people.PersonList.Count;
int Number = 0;
foreach (Person person in people.PersonList)
{
Number++;
StatusCallback(Number, Total, person, false);
// Skip the person if the Setnb is in SetnbsToSkip
if ((SetnbsToSkip == null) || (!SetnbsToSkip.Contains(person.Setnb)))
{
// Get the person's publications. If there are no publications for
// the person, this will throw an error.
Publications pubs;
try
{
pubs = new Publications(DB, person, PeoplePublicationsTable, false);
}
catch (Exception ex)
{
MessageCallback(ex.Message);
pubs = null;
}
// Sort the list of publications
if (pubs != null)
{
PublicationComparer Comparer = new PublicationComparer();
Comparer.DB = DB;
Comparer.person = person;
Comparer.publicationTypes = PubTypes;
Array.Sort(pubs.PublicationList, Comparer);
// Find the minimum and maximum years
int YearMinimum = pubs.PublicationList[0].Year;
int YearMaximum = pubs.PublicationList[0].Year;
foreach (Publication pub in pubs.PublicationList)
{
if (pub.Year < YearMinimum)
YearMinimum = pub.Year;
if (pub.Year > YearMaximum)
YearMaximum = pub.Year;
}
// Write each row
for (int Year = YearMinimum; Year <= YearMaximum; Year++)
{
StatusCallback(Year - YearMinimum, YearMaximum - YearMinimum, person, true);
writer.WriteLine(ReportRow(person, pubs, Year));
}
}
}
else
{
MessageCallback("Skipping " + person.Last + " (" + person.Setnb + ")");
}
}
}
///
/// Write one row of the publications report
///
/// Publication to write
///
public string PubsReportRow(Person person, Publication pub)
{
StringBuilder sb = new StringBuilder();
StringWriter writer = new StringWriter(sb);
// setnb -- unique identifier
writer.Write(person.Setnb);
// pmid -- Unique article identifier
WriteCSV(pub.PMID.ToString(), writer);
// journal_name -- Name of journal
WriteCSV(pub.Journal, writer);
// year -- Year of publication
WriteCSV(pub.Year.ToString(), writer);
// Month -- Month of publication
WriteCSV(pub.Month, writer);
// day -- Day of publication
WriteCSV(pub.Day, writer);
// title -- Article title
WriteCSV(pub.Title, writer);
// Volume -- volume number of the journal in which the article was published
WriteCSV(pub.Volume, writer);
// issue -- Issue in which the article was published
WriteCSV(pub.Issue, writer);
// position -- Position in authorship list
Harvester.AuthorPositions positionType;
int authorPosition = person.GetAuthorPosition(DB, pub, out positionType);
WriteCSV(authorPosition.ToString(), writer);
// nbauthors -- Number of coauthors (including person)
WriteCSV(pub.Authors.Length.ToString(), writer);
// Bin -- From I to IV, based on the PubTypeCategoryID value
int bin = PubTypes.GetCategoryNumber(pub.PubType);
WriteCSV(bin.ToString(), writer);
// Pages -- Page numbers
WriteCSV(pub.Pages, writer);
// Publication_type -- Publication Type from Medline
WriteCSV(pub.PubType, writer);
return sb.ToString();
}
///
/// Write the publications report
///
/// Writer to write the report to
public void PubsReport(ArrayList SetnbsToSkip, StreamWriter writer, ReportStatus StatusCallback, ReportMessage MessageCallback)
{
// Write the header row
string[] columns = { "setnb", "pmid", "journal_name", "year", "Month",
"day", "title", "Volume", "issue", "position", "nbauthors", "Bin",
"Pages", "Publication_type" };
writer.WriteLine(String.Join(",", columns));
// Write the row for each person
People people = new People(DB, PeopleTable);
int Total = people.PersonList.Count;
int Number = 0;
foreach (Person person in people.PersonList)
{
Number++;
StatusCallback(Number, Total, person, false);
// Skip the person if the Setnb is in SetnbsToSkip
if ((SetnbsToSkip == null) || (!SetnbsToSkip.Contains(person.Setnb)))
{
// Get the person's publications -- this will throw an exception
// if there are no publications so catch it and use the message
// callback
Publications pubs = null;
try
{
pubs = new Publications(DB, person, PeoplePublicationsTable, false);
}
catch (Exception ex)
{
MessageCallback("Unable to retrive publications for " + person.Last + " (" + person.Setnb + "): " + ex.Message);
}
if (pubs != null)
{
foreach (Publication pub in pubs.PublicationList)
{
// Write each row
writer.WriteLine(PubsReportRow(person, pub));
}
}
}
else
{
MessageCallback("Skipping " + person.Last + " (" + person.Setnb + ")");
}
}
}
///
/// Write a CSV-friendly string
///
/// String to write
/// Writer to write to
private static void WriteCSV(string data, StringWriter writer)
{
if (data == null)
data = "";
if (data.Contains(","))
{
data = "\"" + data.Replace("\"", "\"\"") + "\"";
}
writer.Write("," + data);
}
///
/// Create a backup of either a People or Publications report, copying
/// rows for all but the last Setnb in the file. Return an array of
/// the Setnbs in the new file.
///
/// ArrayList of Setnbs that were found in the previous file
public static ArrayList BackupReportAndGetSetnbs(string Filename)
{
if (File.Exists(Filename + ".bak"))
File.Delete(Filename + ".bak");
File.Move(Filename, Filename + ".bak");
// Read the setnbs from the file
ArrayList Setnbs = new ArrayList();
string LastSetnb = "";
// First pass through the file: Get the setnbs
StreamReader reader = new StreamReader(Filename + ".bak");
string Line = reader.ReadLine(); // Get the header row
// If the header row doesn't start with "setnb,", this file is invalid
// and should be thrown out.
if ((Line == null) || (!Line.ToLower().StartsWith("setnb,")))
return null;
while ((Line = reader.ReadLine()) != null) {
// If there's a line that doesn't start with a setnb, the file is
// invalid and should be thrown out.
if (Line.Length < 10 || Line.Substring(8, 1) != ",")
return null;
string Setnb = Line.Substring(0, 8);
if ((LastSetnb != "") && (Setnb != LastSetnb))
Setnbs.Add(LastSetnb); // This will make sure the last Setnb isn't added
LastSetnb = Setnb;
}
reader.Close();
reader = new StreamReader(Filename + ".bak");
StreamWriter writer = new StreamWriter(Filename);
// Second pass through the file: Copy all but the last setnb
// This is done so that if the file was interrupted, a partial person
// isn't saved to the report.
Line = reader.ReadLine(); // copy the header row
writer.WriteLine(Line);
while ((Line = reader.ReadLine()) != null)
{
string Setnb = Line.Substring(0, 8);
if (Setnbs.Contains(Setnb))
{
writer.WriteLine(Line);
}
}
reader.Close();
writer.Close();
return Setnbs;
}
///
/// Write the MeSH Heading report
///
/// Writer to send the report to
public void MeSHHeadingReport(StreamWriter writer, ReportStatus StatusCallback, ReportMessage MessageCallback)
{
// Write the header
writer.WriteLine("setnb,year,heading,count");
// The MeSH Heading report has one row per person per year per heading
People people = new People(DB, PeopleTable);
int Total = people.PersonList.Count;
int Count = 0;
foreach (Person person in people.PersonList)
{
// Report status
Count++;
StatusCallback(Count, Total, person, false);
// Catch any errors, report them, and continue
try
{
// Find the minimum and maximum year for the person
int MinYear = 0;
int MaxYear = 0;
Publications pubs = new Publications(DB, person, PeoplePublicationsTable, false);
Hashtable years = new Hashtable();
foreach (Publication pub in pubs.PublicationList)
{
if (MinYear == 0 || MinYear > pub.Year)
MinYear = pub.Year;
if (MaxYear == 0 || MaxYear < pub.Year)
MaxYear = pub.Year;
// Go through each of the MeSH headings and count how many
// occurrences of each heading are in each year. Store each
// count in a hashtable keyed by heading, which in turn is
// stored in a hashtable keyed by year.
if (!years.ContainsKey(pub.Year))
years[pub.Year] = new Hashtable();
Hashtable yearHeadings = (Hashtable)years[pub.Year];
if (pub.MeSHHeadings != null)
{
foreach (string Heading in pub.MeSHHeadings)
{
if (!yearHeadings.ContainsKey(Heading))
yearHeadings[Heading] = 0;
yearHeadings[Heading] = ((int)yearHeadings[Heading]) + 1;
}
}
}
// Write the heading rows for each year
for (int Year = MinYear; Year <= MaxYear; Year++)
{
// Write the rows for that person's year to the writer
if (years.ContainsKey(Year))
{
Hashtable yearHeadings = (Hashtable)years[Year];
if (yearHeadings != null)
{
foreach (string Heading in yearHeadings.Keys)
{
StringWriter swriter = new StringWriter();
swriter.Write(person.Setnb); // setnb
Reports.WriteCSV(Year.ToString(), swriter); // year
Reports.WriteCSV(Heading, swriter); // heading
Reports.WriteCSV(yearHeadings[Heading].ToString(), swriter); // count
writer.WriteLine(swriter.ToString());
}
}
}
}
}
catch (Exception ex)
{
MessageCallback(ex.Message);
}
}
}
///
/// Write the Grants report
///
/// Writer to send the report to
public void GrantsReport(StreamWriter writer)
{
// Write the header row
writer.WriteLine("year,pmid,grant_id");
// Get the data for the grant report
DataTable Result = DB.ExecuteQuery(
@"SELECT p.Year, p.PMID, pg.GrantID
FROM Publications p, PublicationGrants pg
WHERE p.PMID = pg.PMID
AND p.PubTypeCategoryID IN (1,2,3,4)
ORDER BY p.Year, p.PMID"
);
// Write the report to the writer
for (int i = 0; i < Result.Rows.Count; i++)
{
DataRow Row = Result.Rows[i];
StringWriter swriter = new StringWriter();
swriter.Write(Row[0].ToString()); // year
Reports.WriteCSV(Row[1].ToString(), swriter); // PMID
Reports.WriteCSV(Row[2].ToString(), swriter); // grant ID
writer.WriteLine(swriter.ToString());
}
}
}
}