using System;
using System.Collections.Generic;
using System.Linq;
using System.Text;
using System.IO;
namespace ScientificDistance
{
///
/// Static class to read data from the input file
///
public static class Files
{
///
/// Header row for the report
///
public const string ReportHeader = "setnb1,range1,setnb2,range2,nb_unq_keywords_1,nb_frq_keywords_1,nb_unq_keywords_2,nb_frq_keywords_2,nb_unq_keywords_ovrlp,nb_frq_keywords_ovrlp";
///
/// Header row for the MeSH report
///
public const string MeSHHeader = "setnb,range,heading,count";
///
/// Header row for the input file
///
public const string InputFileHeader = "setnb1,range1,setnb2,range2";
///
/// Read the rows from an input file and return a list of InputRow objects
///
/// Input file rows to read
/// List of InputRow objects that contains the data from the input file
public static List ReadInput(string[] inputFile)
{
if (inputFile[0] != Files.InputFileHeader)
throw new FormatException("Input file header row is invalid: " + inputFile[0]);
string[] inputFileWithoutHeader = new string[inputFile.Length - 1];
Array.Copy(inputFile, 1, inputFileWithoutHeader, 0, inputFile.Length - 1);
List rows = new List();
int rowNumber = 0;
foreach (string row in inputFileWithoutHeader)
{
rowNumber++;
string[] columns = row.Split(new char[] { ',' });
if (columns.Length != 4)
throw new FormatException("Wrong number of columns in row " + rowNumber + ": " + row);
if (String.IsNullOrEmpty(columns[0]) || String.IsNullOrEmpty(columns[2]))
throw new FormatException("Empty setnb in row " + rowNumber + ": " + row);
rows.Add(new InputRow()
{
Scientist1 = columns[0].Unquote(),
Window1 = columns[1].Unquote(),
Scientist2 = columns[2].Unquote(),
Window2 = columns[3].Unquote(),
});
}
return rows;
}
public static List FaultTolerantCopy(string outputFilename, string meshHeadingReportFilename,
bool rollingWindow, List inputRows)
{
// Rolling windows are not support for fault-tolerant copies, because there's no way
// to tell the difference between an output file that is missing a row and an output
// file that naturally has that row missing because there were no overlapping windows
if (rollingWindow)
throw new OperationCanceledException("Fault tolerance is not supported for rolling windows");
// Work on a new copy of the InputRows list
List newInputRows = new List(inputRows);
// Back up the output files -- this will check that they match, and read their
// contents into outputLines and meshLines (cutting off the last block of each
// and only including those lines that match each other
string[] reportLines;
string[] meshLines;
BackupAndReadOutputFiles(outputFilename, meshHeadingReportFilename, out reportLines, out meshLines);
// Compare the two files, only keep lines that match in both of them
// It doesn't matter whether or not we're using rolling windows because
// the lines should match in either case, and we've already backed out the
// last block. If we reach the end of one file, we'll cut of the other.
MakeSureReportsMatch(ref reportLines, ref meshLines);
// Now that the files match each other and only contain complete blocks, we just need
// to start at the beginning of the input file and match it to the two report files,
// cutting off rows as they go... but make sure we skip the header row, so start at row #1
// for both files by setting reportRow to 1.
int reportRow = 1;
while (reportRow < reportLines.Length)
{
// Skip past next input row
if (!CheckNextReportRow(newInputRows[0], reportLines, reportRow))
throw new OperationCanceledException("Output row doesn't match input file: " + reportLines[reportRow]);
reportRow++;
// Sine the rows match, we can remove the top row from newInputRows
newInputRows.RemoveAt(0);
}
// Write the new files
File.WriteAllLines(outputFilename, reportLines);
File.WriteAllLines(meshHeadingReportFilename, meshLines);
return newInputRows;
}
///
/// Back up the output files and return their contents.
///
/// Name of the output file
/// Name of the mesh heading report file
/// Output that holds the lines of the output file
/// Output that holds the lines of the mesh report file
private static void BackupAndReadOutputFiles(string outputFilename, string meshHeadingReportFilename,
out string[] reportLines, out string[] meshLines)
{
// Back up the old report files
File.Delete(outputFilename + ".bak");
File.Delete(meshHeadingReportFilename + ".bak");
File.Move(outputFilename, outputFilename + ".bak");
File.Move(meshHeadingReportFilename, meshHeadingReportFilename + ".bak");
// Read the lines from the old report files, but cut off the last line in case it's damaged
reportLines = File.ReadAllLines(outputFilename + ".bak");
if (reportLines.Length > 0)
Array.Resize(ref reportLines, reportLines.Length - 1);
// Read the lines from the MeSH report, but cut off the last line
meshLines = File.ReadAllLines(meshHeadingReportFilename + ".bak");
if (meshLines.Length > 0)
Array.Resize(ref meshLines, meshLines.Length - 1);
}
///
/// Make sure the two reports match by reading the counts from the report and
/// matching them up to the MeSH report. Truncate any unmatched rows from the
/// end that may have gotten cut off if the program was terminated early.
///
/// Rows from the report
/// Rows from the MeSH report
private static void MakeSureReportsMatch(ref string[] reportLines, ref string[] meshLines)
{
// First check the header rows
if (reportLines[0] != ReportHeader)
{
reportLines = new string[1];
reportLines[0] = ReportHeader;
}
if (meshLines[0] != MeSHHeader)
{
meshLines = new string[1];
meshLines[0] = MeSHHeader;
}
// We'll compare the files by reading the nb_unq_keywords1 and nb_unq_keywords2 values
// from the report file, which should contain the number of rows in each block in the
// MeSH report. We'll then advance past each block in the MeSH file. If we run into a block
// at the end that doesn't match, we'll roll the MeSH report back.
int reportRow = 1;
int meshRow = 1;
bool finished = false;
while (!finished && reportRow < reportLines.Length && meshRow < meshLines.Length)
{
string[] reportColumns = reportLines[reportRow].Split(new char[] { (',') });
if (reportColumns.Length != 10)
throw new FormatException("Invalid row in report file: " + reportLines[reportRow]);
// Read the MeSH report block for the two scientists
string scientist1 = reportColumns[0];
string window1 = reportColumns[1];
string scientist2 = reportColumns[2];
string window2 = reportColumns[3];
int meshCount1;
if (!int.TryParse(reportColumns[4], out meshCount1))
throw new FormatException("Invalid nb_unq_keywords1 in report file: " + reportLines[reportRow]);
int meshCount2;
if (!int.TryParse(reportColumns[6], out meshCount2))
throw new FormatException("Invalid nb_unq_keywords2 in report file: " + reportLines[reportRow]);
if (meshRow + meshCount1 + meshCount2 < meshLines.Length)
{
for (int i = meshRow; i < meshRow + meshCount1; i++)
{
string row = meshLines[i];
if (!row.StartsWith(scientist1 + "," + window1 + ","))
throw new OperationCanceledException("Row in MeSH report doesn't match report file: " + row);
}
meshRow += meshCount1;
for (int i = meshRow; i < meshRow + meshCount2; i++)
{
string row = meshLines[i];
if (!row.StartsWith(scientist2 + "," + window2 + ","))
throw new OperationCanceledException("Row in MeSH report doesn't match report file: " + row);
}
meshRow += meshCount2;
}
else
finished = true;
if (!finished)
reportRow++;
}
if (reportRow < reportLines.Length)
Array.Resize(ref reportLines, reportRow);
if (meshRow < meshLines.Length)
Array.Resize(ref meshLines, meshRow);
}
private static bool CheckNextReportRow(InputRow inputRow, string[] outputLines, int outputRow)
{
if (outputRow >= outputLines.Length)
return false;
string[] columns = outputLines[outputRow].Split(new char[] { ',' });
if (inputRow.Scientist1 == columns[0]
&& inputRow.Window1 == columns[1]
&& inputRow.Scientist2 == columns[2]
&& inputRow.Window2 == columns[3])
return true;
else
return false;
}
///
/// Extension nethod to undo CSV-style quoting in a string
///
/// Unquoted string
private static string Unquote(this string input)
{
if (input.StartsWith("\""))
if (input.EndsWith("\""))
{
string unquoted = input.Substring(1, input.Length - 2);
return unquoted.Replace("\"\"", "\"");
}
else
throw new FormatException("Unquote() did not find closing quote");
else
return input;
}
}
}