#region License Information
/* HeuristicLab
* Copyright (C) 2002-2015 Heuristic and Evolutionary Algorithms Laboratory (HEAL)
*
* This file is part of HeuristicLab.
*
* HeuristicLab is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* HeuristicLab is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with HeuristicLab. If not, see <http://www.gnu.org/licenses/>.
*/
#endregion
using System;
using System.Collections;
using System.Collections.Generic;
using System.Diagnostics.Contracts;
using System.Globalization;
using System.IO;
using System.Linq;
using System.Runtime.Serialization;
using System.Text;
namespace HeuristicLab.Problems.Instances.DataAnalysis {
public class TableFileParser : Progress { // reports the number of bytes read
// size constant; it is not referenced in the part of the file visible here
private const int BUFFER_SIZE = 65536;
// char used to symbolize whitespaces (no missing values can be handled with whitespaces)
private const char WHITESPACECHAR = (char)0;
// candidate column separators, including the whitespace placeholder defined above
// NOTE(review): presumably consulted by DetermineFileFormat - confirm, that method is not visible here
private static readonly char[] POSSIBLE_SEPARATORS = new char[] { ',', ';', '\t', WHITESPACECHAR };
// tokenizer over the input that is currently processed; re-created by every Parse / AreColumnNamesInFirstLine call
private Tokenizer tokenizer;
private int estimatedNumberOfLines = 200; // initial capacity for columns, will be set automatically when data is read from a file
// encoding handed to the StreamReader instances created by this class; exposed through the Encoding property
private Encoding encoding = Encoding.Default;
/// <summary>
/// Gets or sets the text encoding that is passed to the stream readers created by this parser.
/// </summary>
/// <exception cref="ArgumentNullException">Thrown when null is assigned.</exception>
public Encoding Encoding {
  get { return encoding; }
  set {
    // the implicit parameter of a property setter is called "value", so that is the name to report
    if (value == null) throw new ArgumentNullException("value");
    encoding = value;
  }
}
// backing field of Rows; Parse overwrites it with the length of the first parsed column
private int rows;
/// <summary>
/// Gets or sets the number of data rows.
/// </summary>
public int Rows {
  get {
    return this.rows;
  }
  set {
    this.rows = value;
  }
}
// backing field of Columns; Parse overwrites it with the number of parsed columns
private int columns;
/// <summary>
/// Gets or sets the number of columns. The value also determines how many
/// placeholder names VariableNames generates when no names were read.
/// </summary>
public int Columns {
  get {
    return this.columns;
  }
  set {
    this.columns = value;
  }
}
// one list per column; assigned by Parse and not initialized by the constructor
// NOTE(review): the element type IList is reconstructed from usage (Count, indexed replacement and
// 'as' casts to several list types inside Parse) - confirm against CreateList / ConvertToStringColumn
private List<IList> values;
/// <summary>
/// Gets the parsed data, organized as one list per column.
/// </summary>
public List<IList> Values {
  get { return values; }
}
// names of the columns; stays empty when no names have been stored
// NOTE(review): presumably filled by ParseVariableNames - confirm, that method is not visible here
private List<string> variableNames;
/// <summary>
/// Gets the column names. When no names have been stored, placeholder names
/// "X000", "X001", ... are generated, one for each of the <see cref="Columns"/> columns.
/// </summary>
public IEnumerable<string> VariableNames {
  get {
    if (variableNames.Count > 0) return variableNames;

    // no names available -> fall back to generated names with a three-digit index
    string[] names = new string[columns];
    for (int i = 0; i < names.Length; i++) {
      names[i] = "X" + i.ToString("000");
    }
    return names;
  }
}
/// <summary>
/// Creates a parser with an empty list of variable names.
/// </summary>
public TableFileParser() {
  // the getter of VariableNames returns string[] as its fallback, so the names are stored as strings
  variableNames = new List<string>();
}
/// <summary>
/// Determines the format of the given file and then checks whether its first token is something other than a number.
/// </summary>
/// <param name="fileName">file which is examined</param>
/// <returns>true if the first token of the file is not a double value</returns>
public bool AreColumnNamesInFirstLine(string fileName) {
  NumberFormatInfo detectedNumberFormat;
  DateTimeFormatInfo detectedDateTimeFormat;
  char detectedSeparator;
  DetermineFileFormat(fileName, out detectedNumberFormat, out detectedDateTimeFormat, out detectedSeparator);

  using (FileStream input = new FileStream(fileName, FileMode.Open, FileAccess.Read, FileShare.ReadWrite)) {
    bool firstLineHoldsNames = AreColumnNamesInFirstLine(input, detectedNumberFormat, detectedDateTimeFormat, detectedSeparator);
    return firstLineHoldsNames;
  }
}
/// <summary>
/// Checks the first token of the stream using the default format:
/// NumberFormatInfo.InvariantInfo, DateTimeFormatInfo.InvariantInfo and separator = ','
/// </summary>
/// <param name="stream">stream which is examined</param>
/// <returns>true if the first token of the stream is not a double value</returns>
public bool AreColumnNamesInFirstLine(Stream stream) {
  return AreColumnNamesInFirstLine(stream, NumberFormatInfo.InvariantInfo, DateTimeFormatInfo.InvariantInfo, ',');
}
/// <summary>
/// Opens the given file for reading and checks its first token using the given formats.
/// </summary>
/// <param name="fileName">file which is examined</param>
/// <param name="numberFormat">Format of numbers</param>
/// <param name="dateTimeFormatInfo">Format of datetime</param>
/// <param name="separator">defines the separator</param>
/// <returns>true if the first token of the file is not a double value</returns>
public bool AreColumnNamesInFirstLine(string fileName, NumberFormatInfo numberFormat,
  DateTimeFormatInfo dateTimeFormatInfo, char separator) {
  using (FileStream input = new FileStream(fileName, FileMode.Open, FileAccess.Read, FileShare.ReadWrite)) {
    bool firstLineHoldsNames = AreColumnNamesInFirstLine(input, numberFormat, dateTimeFormatInfo, separator);
    return firstLineHoldsNames;
  }
}
/// <summary>
/// Creates a tokenizer over the stream and peeks at the type of the first token.
/// The reader created here is disposed on return, which also closes the given stream.
/// </summary>
/// <param name="stream">stream which is examined</param>
/// <param name="numberFormat">Format of numbers</param>
/// <param name="dateTimeFormatInfo">Format of datetime</param>
/// <param name="separator">defines the separator</param>
/// <returns>true if the first token is not a double value</returns>
public bool AreColumnNamesInFirstLine(Stream stream, NumberFormatInfo numberFormat,
  DateTimeFormatInfo dateTimeFormatInfo, char separator) {
  using (var textReader = new StreamReader(stream, Encoding)) {
    tokenizer = new Tokenizer(textReader, numberFormat, dateTimeFormatInfo, separator);
    var firstTokenType = tokenizer.PeekType();
    // anything but a number in the first position is treated as a column name
    return firstTokenType != TokenTypeEnum.Double;
  }
}
/// <summary>
/// Parses a file and determines the format first
/// </summary>
/// <param name="fileName">file which is parsed</param>
/// <param name="columnNamesInFirstLine">true if the variable names have to be read before the data values</param>
/// <param name="lineLimit">maximum number of lines to read; a negative value (default) reads all lines</param>
public void Parse(string fileName, bool columnNamesInFirstLine, int lineLimit = -1) {
  NumberFormatInfo numberFormat;
  DateTimeFormatInfo dateTimeFormatInfo;
  char separator;
  DetermineFileFormat(fileName, out numberFormat, out dateTimeFormatInfo, out separator);
  EstimateNumberOfLines(fileName);
  // own the file stream here (as the overload with explicit formats does) instead of relying on the callee to close it
  using (var stream = new FileStream(fileName, FileMode.Open, FileAccess.Read, FileShare.ReadWrite)) {
    Parse(stream, numberFormat, dateTimeFormatInfo, separator, columnNamesInFirstLine, lineLimit);
  }
}
/// <summary>
/// Parses a file using explicitly specified formats; no format detection takes place.
/// </summary>
/// <param name="fileName">file which is parsed</param>
/// <param name="numberFormat">Format of numbers</param>
/// <param name="dateTimeFormatInfo">Format of datetime</param>
/// <param name="separator">defines the separator</param>
/// <param name="columnNamesInFirstLine">true if the variable names have to be read before the data values</param>
/// <param name="lineLimit">maximum number of lines to read; a negative value (default) reads all lines</param>
public void Parse(string fileName, NumberFormatInfo numberFormat, DateTimeFormatInfo dateTimeFormatInfo, char separator, bool columnNamesInFirstLine, int lineLimit = -1) {
  // guess the row count up front so that the column lists can be allocated with a fitting capacity
  EstimateNumberOfLines(fileName);

  using (FileStream input = new FileStream(fileName, FileMode.Open, FileAccess.Read, FileShare.ReadWrite)) {
    Parse(input, numberFormat, dateTimeFormatInfo, separator, columnNamesInFirstLine, lineLimit);
  }
}
// Counts the newline characters in the beginning of the file (at most 1024 * 1024 characters) and uses the
// average length of the complete lines in that sample to guess the number of rows of the whole file.
// The result is stored in estimatedNumberOfLines; the current value is kept if fewer than two newlines are found.
private void EstimateNumberOfLines(string fileName) {
  var len = new System.IO.FileInfo(fileName).Length;
  var buf = new char[1024 * 1024];
  int charsRead;
  using (var reader = new StreamReader(fileName, Encoding)) {
    // ReadBlock reports how many characters were actually read (less than buf.Length for small files)
    charsRead = reader.ReadBlock(buf, 0, buf.Length);
  }
  int numNewLine = 0;
  int charsInCurrentLine = 0, charsInFirstLine = 0; // the first line (names) and the last line (incomplete) are not representative
  // only look at the characters that were read, not at the unused remainder of the buffer
  for (int i = 0; i < charsRead; i++) {
    charsInCurrentLine++;
    if (buf[i] == '\n') {
      if (numNewLine == 0) charsInFirstLine = charsInCurrentLine; // store the number of chars in the first line
      charsInCurrentLine = 0;
      numNewLine++;
    }
  }
  if (numNewLine <= 1) {
    // fail -> keep the default setting
    return;
  }
  // average number of characters per line, ignoring the first line and the trailing incomplete line
  double charsPerLineFactor = (charsRead - charsInFirstLine - charsInCurrentLine) / ((double)numNewLine - 1);
  // len is the file size in bytes, so this is only an approximation for multi-byte encodings
  double estimatedLines = len / charsPerLineFactor;
  estimatedNumberOfLines = (int)Math.Round(estimatedLines * 1.1); // pessimistic allocation of 110% to make sure that the list is very likely large enough
}
/// <summary>
/// Parses a stream using the default format:
/// NumberFormatInfo.InvariantInfo, DateTimeFormatInfo.InvariantInfo and separator = ','
/// </summary>
/// <param name="stream">stream which is parsed</param>
/// <param name="columnNamesInFirstLine">true if the variable names have to be read before the data values</param>
/// <param name="lineLimit">maximum number of lines to read; a negative value (default) reads all lines</param>
public void Parse(Stream stream, bool columnNamesInFirstLine, int lineLimit = -1) {
  Parse(stream, NumberFormatInfo.InvariantInfo, DateTimeFormatInfo.InvariantInfo, ',', columnNamesInFirstLine, lineLimit);
}
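/// <summary>
/// Parses a stream with the given formats.
/// </summary>
/// <param name="stream">Stream which is parsed</param>
/// <param name="numberFormat">Format of numbers</param>
/// <param name="dateTimeFormatInfo">Format of datetime</param>
/// <param name="separator">defines the separator</param>
/// <param name="columnNamesInFirstLine"></param>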
public void Parse(Stream stream, NumberFormatInfo numberFormat, DateTimeFormatInfo dateTimeFormatInfo, char separator, bool columnNamesInFirstLine, int lineLimit = -1) {
using (StreamReader reader = new StreamReader(stream, Encoding)) {
tokenizer = new Tokenizer(reader, numberFormat, dateTimeFormatInfo, separator);
values = new List();
if (lineLimit > 0) estimatedNumberOfLines = lineLimit;
if (columnNamesInFirstLine) {
ParseVariableNames();
if (!tokenizer.HasNext())
Error(
"Couldn't parse data values. Probably because of incorrect number format (the parser expects english number format with a '.' as decimal separator).",
"", tokenizer.CurrentLineNumber);
}
// read values... start in first row
int nLinesParsed = 0;
int colIdx = 0;
int numValuesInFirstRow = columnNamesInFirstLine ? variableNames.Count : -1; // number of variables or inizialize based on first row of values (-1)
while (tokenizer.HasNext() && (lineLimit < 0 || nLinesParsed < lineLimit)) {
if (tokenizer.PeekType() == TokenTypeEnum.NewLine) {
tokenizer.Skip();
// all rows have to have the same number of values
// the first row defines how many samples are needed
if (numValuesInFirstRow < 0) numValuesInFirstRow = values.Count; // set to number of colums in the first row
else if (colIdx > 0 && numValuesInFirstRow != colIdx) { // read at least one value in the row (support for skipping empty lines)
Error("The first row of the dataset has " + numValuesInFirstRow + " columns." + Environment.NewLine +
"Line " + tokenizer.CurrentLineNumber + " has " + colIdx + " columns.", "",
tokenizer.CurrentLineNumber);
}
OnReport(tokenizer.BytesRead);
nLinesParsed++;
colIdx = 0;
} else {
// read one value
TokenTypeEnum type; string strVal; double dblVal; DateTime dateTimeVal;
tokenizer.Next(out type, out strVal, out dblVal, out dateTimeVal);
// initialize columns on the first row (fixing data types as presented in the first row...)
if (nLinesParsed == 0) {
values.Add(CreateList(type, estimatedNumberOfLines));
} else if (colIdx == values.Count) {
Error("The first row of the dataset has " + numValuesInFirstRow + " columns." + Environment.NewLine +
"Line " + tokenizer.CurrentLineNumber + " has more columns.", "",
tokenizer.CurrentLineNumber);
}
if (!IsColumnTypeCompatible(values[colIdx], type)) {
values[colIdx] = ConvertToStringColumn(values[colIdx]);
}
// add the value to the column
AddValue(type, values[colIdx++], strVal, dblVal, dateTimeVal);
}
}
if (!values.Any() || values.First().Count == 0)
Error("Couldn't parse data values. Probably because of incorrect number format " +
"(the parser expects english number format with a '.' as decimal separator).", "", tokenizer.CurrentLineNumber);
}
this.rows = values.First().Count;
this.columns = values.Count;
// after everything has been parsed make sure the lists are as compact as possible
foreach (var l in values) {
var dblList = l as List;
var byteList = l as List;
var dateList = l as List;
var stringList = l as List;
var objList = l as List