/*
 * AsciiParser.java
 *
 * Created on May 25, 2007, 7:01 AM
 *
 * To change this template, choose Tools | Template Manager
 * and open the template in the editor.
 */
package org.das2.qds.util;

import java.util.logging.Level;
import org.das2.datum.Units;
import org.das2.datum.UnitsUtil;
import org.das2.util.monitor.ProgressMonitor;
import java.io.*;
import java.text.ParseException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Comparator;
import java.util.HashMap;
import java.util.LinkedHashMap;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;
import java.util.concurrent.atomic.AtomicInteger;
import java.util.logging.Logger;
import java.util.regex.*;
import org.das2.datum.Datum;
import org.das2.datum.DatumRange;
import org.das2.datum.DatumRangeUtil;
import org.das2.datum.EnumerationUnits;
import org.das2.datum.InconvertibleUnitsException;
import org.das2.datum.TimeParser;
import org.das2.qds.DataSetUtil;
import org.das2.qds.MutablePropertyDataSet;
import org.das2.qds.QDataSet;
import org.das2.qds.SparseDataSetBuilder;
import org.das2.qds.WritableDataSet;
import org.das2.qds.ops.Ops;
import org.das2.util.LoggerManager;

/**
 * Class for reading ASCII tables into a QDataSet.  This parses a file by breaking
 * it up into records, and passing each record off to a delegate record parser.
 * The record parser then breaks up the record into fields, and each field is
 * parsed by a delegate field parser.  Each column of the table has a Unit, field name,
 * and field label associated with it.
 *
 * Examples of record parsers include
 * DelimParser, which splits the record by a delimiter such as a tab or comma,
 * RegexParser, which processes each record with a regular expression to get the fields,
 * and FixedColumnsParser, which splits the record by character positions.
 * Examples of field parsers include DOUBLE_PARSER, which parses the value
 * as a double, and UNITS_PARSER, which uses the Unit attached to the column
 * to interpret the value.
 *
 * When the first record with the correct number of fields is found but is not
 * parseable, we look for field labels and units.
 *
 * The skipLines property tells the parser to skip a given number of header lines
 * before attempting to parse the record.  Also, commentPrefix identifies lines to be
 * ignored.  In either the header or in comments, we look for propertyPattern, and
 * if a property is matched, then the builder property is set.  Two Patterns,
 * NAME_COLON_VALUE_PATTERN and NAME_EQUAL_VALUE_PATTERN, are provided for convenience.
 *
 * Adapted to the QDataSet model, Jeremy, May 2007.
 *
 * @author Jeremy
 */
public class AsciiParser {

    private static final Logger logger= LoggerManager.getLogger("qdataset.ascii");

    Pattern propertyPattern = null;
    String commentPrefix = "#";

    /**
     * a java identifier that can be used to identify the column.
     */
    String[] fieldNames;

    /**
     * rich headers are put here.
     */
    //AsciiHeadersParser.BundleDescriptor bundleDescriptor;
    MutablePropertyDataSet bundleDescriptor;

    /**
     * units for each column.
     */
    Units[] units;

    /**
     * the number of fields which are not nominal data.  Since any value
     * is trivially valid, when deciding if a line is a record or not, this
     * count should be used.  This will be -1 while parsing is being configured,
     * and then 0 or more once parsing has begun.
     */
    int nonEnumFields= -1;

    /** either the unit or depend 1 value associated with the column,
     * e.g. Density(cc**-3) or flux_C4(6.4).
     * @see #units
     */
    String[] fieldUnits;

    /**
     * the presentation label for the column.
*/ String[] fieldLabels; FieldParser[] fieldParsers; final static String numberPart = "[\\d\\.eE\\+\\-]+"; final static String decimalRegex = numberPart; int skipLines; int recordCountLimit = Integer.MAX_VALUE; int recordStart = 0; int fieldCount; private Boolean isRichAscii= null; /** * pattern for name:value. */ public final static Pattern NAME_COLON_VALUE_PATTERN = Pattern.compile("\\s*([a-zA-Z_].*?)\\s*\\:\\s*(.+)\\s*"); /** * pattern for name=value. */ public final static Pattern NAME_EQUAL_VALUE_PATTERN = Pattern.compile("\\s*([a-zA-Z_].*?)\\s*\\=\\s*(.+)\\s*"); /** * detect identifiers for columns. */ Pattern COLUMN_ID_HEADER_PATTERN = Pattern.compile("\\s*\"?([a-zA-Z][a-zA-Z _0-9]*)([\\(\\[]([a-zA-Z_\\!\\.\\[\\-\\]0-9//\\*\\^]*)[\\)\\]])?\"?\\s*"); /** * allow columns to be labeled with some datum ranges, such as 10.0-13.1. We convert these into an identifier, but depend1labels will present as-is. * Note this pattern will match "-999.000" so check groups 2 and 4 for non null. */ private final static Pattern COLUMN_CHANNEL_HEADER_PATTERN = Pattern.compile("\\s*\"?(([a-zA-Z_]*)(\\d*\\.?\\d*([eE]\\d+)?)\\-(\\d*\\.?\\d*([eE]\\d+)?))\"?\\s*"); public final static String PROPERTY_FIELD_NAMES = "fieldNames"; public static final String PROPERTY_FILE_HEADER = "fileHeader"; public static final String PROPERTY_FIRST_RECORD = "firstRecord"; public static final String PROPERTY_FIELD_PARSER = "fieldParser"; public static final String DELIM_COMMA = ","; public static final String DELIM_TAB = "\t"; public static final String DELIM_WHITESPACE = "\\s+"; private static final int HEADER_LENGTH_LIMIT=1000; /** * Convenient unit for parsing UTC times. */ public static final Units UNIT_UTC= Units.t2000; StringBuffer headerBuffer = new StringBuffer(); private AsciiParser(String[] fieldNames) { setRegexParser(fieldNames); } /** * returns true if the line is a header or comment. * @param iline the line number in the file, starting with 0. * @param lastLine the last line read. * @param thisLine the line we are testing. * @param recCount the number of records successfully read. * @return true if the line is a header line. */ public final boolean isHeader(int iline, String lastLine, String thisLine, int recCount) { return (iline < skipLines || (headerDelimiter != null && recCount == 0 && (lastLine == null || !Pattern.compile(headerDelimiter).matcher(lastLine).find())) || (commentPrefix != null && thisLine.startsWith(commentPrefix)) ); } /** * quick-n-dirty check to see if a string appears to be an ISO8601 time. * minimally 2000-002T00:00, but also 2000-01-01T00:00:00Z etc. * Note that an external code may explicitly indicate that the field is a time, * This is just to catch things that are obviously times. * @param s * @return true if this is clearly an ISO time. */ public final boolean isIso8601Time( String s ) { if ( s.length()>13 && s.contains("T") ) { if ( ( s.startsWith("20") || s.startsWith("19") || s.startsWith("18") ) && Character.isDigit(s.charAt(2)) && Character.isDigit(s.charAt(3)) ) { int charCount=4; for ( int i=4; i10; } else { return false; } } else { return false; } } /** * return the first record that the parser would parse. If skipLines is * more than the total number of lines, or all lines are comments, then null * is returned. * * @param filename * @return the first line after skip lines and comment lines. 
* @throws java.io.IOException */ public String readFirstRecord(String filename) throws IOException { return readFirstRecord(new BufferedReader(new FileReader(filename))); } /** * return the first line of the freshly opened file. The reader * is closed. * @param reader * @return * @throws java.io.IOException */ public String readFirstRecord(BufferedReader reader) throws IOException { String line; String lastLine = null; int iline = 0; line = reader.readLine(); while (line != null && isHeader(iline, lastLine, line, 0)) { lastLine = line; line = reader.readLine(); iline++; } reader.close(); return line; } /** * returns the first record that the record parser parses successfully. The * recordParser should be set and configured enough to identify the fields. * If no records can be parsed, then null is returned. * * The first record should be in the first 1000 lines. * * @param filename * @return the first parseable line, or null if no such line exists. * @throws java.io.IOException */ public String readFirstParseableRecord(String filename) throws IOException { String line; try (BufferedReader reader = new LineNumberReader(new FileReader(filename))) { String lastLine = null; line = reader.readLine(); int iline = 0; while (line != null && isHeader(iline, lastLine, line, 0)) { lastLine = line; line = reader.readLine(); iline++; } DataSetBuilder builder = new DataSetBuilder(2, 100, recordParser.fieldCount() ); // check for iso8601 times in the first two columns. if ( UnitsUtil.isTimeLocation(this.units[0]) ) this.fieldParsers[0]= UNITS_PARSER; if ( recordParser.fieldCount()>1 && this.units.length>1 && UnitsUtil.isTimeLocation(this.units[1]) ) this.fieldParsers[1]= UNITS_PARSER; while (line != null && iline50 ) { return currentFirstRecord; } line = reader.readLine(); } } finally { if ( reader!=null ) reader.close(); } return currentFirstRecord; } /** * read in records, allowing for a header of non-records before * guessing the delim parser. This will return a reference to the * DelimParser and set skipLines. DelimParser header field is set as well. * One must set the record parser explicitly. * @param filename * @return the record parser to use, or null if no records are found. * @throws java.io.IOException */ public DelimParser guessSkipAndDelimParser( String filename ) throws IOException { Logger logger= LoggerManager.getLogger("qdataset.ascii.guess"); BufferedReader reader = null; DelimParser result= null; try { reader = new BufferedReader( new FileReader(filename) ); String line; String lastLine = null; line = reader.readLine(); int iline = 0; if ( line==null ) { throw new IllegalArgumentException("File is empty: "+filename); } if ( line.length()>1 ) { if ( line.charAt(0)==0 ) throw new IllegalArgumentException("ASCII file cannot start with 0: "+filename); } headerBuffer= new StringBuffer(); // skip over the beginning lines which are explicitly marked as // headers with a marker (like #) or with the skip lines control. while (line != null && isHeader(iline, lastLine, line, 0)) { lastLine = line; if ( iline lines= new LinkedList<>(); int parseCount=0; // Find a line with a record parser consistent with five other lines. while ( iline10 ) { lines.remove(0); } if ( line!=null ) { // for the delimParser guessed by this line, how many of the last ten lines parse? 
p= guessDelimParser(line,iline); int enumCount= 0; int totalCount= 0; for ( int i=0; i0 ) { String ss= header.substring(0,ii); if ( ss.split("\\#").length>2 ) { throw new IllegalArgumentException("rich header cannot contain more than two hashes (#) on the first line. Maybe newlines were unintentionally removed"); } } bundleDescriptor = AsciiHeadersParser.parseMetadata(header, fieldNames, fieldLabels ); if ( bundleDescriptor.length()==fieldNames.length ) { for ( int j=0; j1 ? line.charAt(ich-1)==',' : false; if ( !afterComma ) { tabDelimFieldCount+= withinQuote ? 0 : 1; } withinWhitespace=true; afterEscape= false; break; case ' ': afterComma= ich>1 ? line.charAt(ich-1)==',' : false; if ( !( withinWhitespace || afterComma ) ) { withinWhitespace=true; whitespaceDelimFieldCount+= withinQuote ? 0 : 1; } afterEscape= false; break; case ';': semiColonDelimFieldCount+= withinQuote ? 0 : 1; afterEscape= false; withinWhitespace=false; break; case ',': commaDelimFieldCount+= withinQuote ? 0 : 1; afterEscape= false; withinWhitespace=false; break; case '\\': afterEscape= true; withinWhitespace=false; break; case '"': if ( !afterEscape ) { withinQuote= !withinQuote; } afterEscape= false; withinWhitespace=false; break; default: afterEscape= false; withinWhitespace=false; break; } } if ( semiColonDelimFieldCount > 1 && semiColonDelimFieldCount>=whitespaceDelimFieldCount/2 ) { fieldSep = ";"; } else if ( tabDelimFieldCount > 1 && tabDelimFieldCount!=whitespaceDelimFieldCount ) { // always use tabs over others, but only if other doesn't work fieldSep = "\t"; } else if ( commaDelimFieldCount > 1 && commaDelimFieldCount>= whitespaceDelimFieldCount/2 ) { //TODO: improve this fieldSep = ","; } else { fieldSep = "\\s+"; } logger.log(Level.FINER, "guessDelimParser guesses \"{0}\" for line {1}", new Object[]{fieldSep, lineNumber}); DelimParser result = createDelimParser(line, fieldSep, lineNumber); this.setRecordParser( result ); return result; } /** * The DelimParser splits each record into fields using a delimiter like "," * or "\\s+". * * @param filename filename to read in. * @param delim the delimiter, such as "," or "\t" or "\s+" * @return the record parser that will split each line into fields * @throws java.io.IOException */ public DelimParser setDelimParser(String filename, String delim) throws IOException { FileReader r= null; DelimParser result=null; try { r= new FileReader(filename); result= setDelimParser(r, delim); } finally { if ( r!=null ) r.close(); } return result; } /** * The DelimParser splits each record into fields using a delimiter like "," * or "\\s+". * @param in * @param delimRegex the delimiter, such as "," or "\t" or "\s+" * @return the record parser that will split each line into fields * @throws java.io.IOException */ public DelimParser setDelimParser(Reader in, String delimRegex) throws IOException { String line; try (BufferedReader reader = new LineNumberReader(in)) { line = readFirstRecord(reader); } DelimParser result = createDelimParser(line, delimRegex, -1); this.setRecordParser( result ); return result; } /** * The regex parser is a slow parser, but gives precise control. * @param fieldNames * @return the parser for each record. 
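     * For example (an illustrative sketch), setRegexParser( new String[] { "time", "density" } )
     * configures the parser to expect two comma- or whitespace-separated numeric fields,
     * named "time" and "density".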
*/ public final RecordParser setRegexParser(String[] fieldNames) { initializeByFieldCount(fieldNames.length); this.fieldNames = Arrays.copyOf( fieldNames, fieldNames.length ); StringBuilder regexBuf = new StringBuilder(); regexBuf.append("\\s*"); for (int i = 0; i < fieldCount - 1; i++) { regexBuf.append("(" + decimalRegex + ")[\\s+,+]\\s*"); } regexBuf.append("(" + decimalRegex + ")\\s*"); this.setRecordParser( new RegexParser(this,regexBuf.toString()) ); return recordParser; } /** * looks at the first line after skipping, and splits it to calculate where * the columns are. The FixedColumnsParser is the fastest of the three parsers. * * @param filename filename to read in. * @param delim regex to split the initial line into the fixed columns. * @return the record parser that will split each line. * @throws java.io.IOException */ public FixedColumnsParser setFixedColumnsParser(String filename, String delim) throws IOException { Reader r=null; FixedColumnsParser result; try { r= new FileReader(filename); result= setFixedColumnsParser( r, delim); } finally { if ( r!=null ) r.close(); } return result; } /** * looks at the first line after skipping, and splits it to calculate where * the columns are. * * @param in the Reader to get lines from. * @param delim regex to split the initial line into the fixed columns. * @return the record parser that will split each line. * @throws java.io.IOException */ public FixedColumnsParser setFixedColumnsParser(Reader in, String delim) throws IOException { String line; int lineNumber; try (LineNumberReader reader = new LineNumberReader(in)) { line = readFirstRecord(reader); lineNumber= reader.getLineNumber(); } int[] columnOffsets; int[] columnWidths; int col = 0; String[] ss = line.split(delim); columnOffsets = new int[ss.length]; columnWidths = new int[ss.length - 1]; initializeByFieldCount(ss.length); initializeUnitsByGuessing(ss,lineNumber); boolean rightJustified = false; if (ss[0].trim().length() == 0) { rightJustified = true; for (int i = 0; i < ss.length - 1; i++) { ss[i] = ss[i + 1]; } } columnOffsets[0] = 0; if (rightJustified) { for (int i = 1; i < ss.length; i++) { col = line.indexOf(ss[i - 1], columnOffsets[i - 1]); columnOffsets[i] = col + ss[i - 1].length(); columnWidths[i - 1] = columnOffsets[i] - columnOffsets[i - 1]; } } else { for (int i = 1; i < ss.length; i++) { col = line.indexOf(ss[i], col + ss[i - 1].length()); // account for whitespace columnOffsets[i] = col; columnWidths[i - 1] = columnOffsets[i] - columnOffsets[i - 1]; } } int[] co = new int[columnWidths.length]; System.arraycopy(columnOffsets, 0, co, 0, columnWidths.length); FixedColumnsParser p = new FixedColumnsParser(co, columnWidths); this.setRecordParser(p); this.propertyPattern = null; return p; } /** * return the field count that would result in the largest number of records parsed. The * entire file is scanned, and for each line the number of decimal fields is counted. At the end * of the scan, the fieldCount with the highest record count is returned. * @param filename the file name, a local file opened with a FileReader * @return the apparent field count. * @throws java.io.FileNotFoundException */ public static int guessFieldCount(String filename) throws FileNotFoundException, IOException { final int maxFieldCount = 10; // can only identify maxFieldCount - 1. 
int[] recCount = new int[maxFieldCount]; StringBuilder regexBuf = new StringBuilder(); regexBuf.append("\\s*(" + decimalRegex + ")"); for (int i = 1; i < maxFieldCount; i++) { regexBuf.append("([\\s+,+]\\s*(" + decimalRegex + "))?"); } regexBuf.append("\\s*"); Pattern pat = Pattern.compile(regexBuf.toString()); try (BufferedReader reader = new LineNumberReader(new FileReader(filename))) { String line; while ((line = reader.readLine()) != null) { Matcher m = pat.matcher(line); if (m.matches()) { int j; for (j = 1; j < m.groupCount(); j += 2) { if (m.group(j) == null) { recCount[(j - 1) / 2]++; break; } } } } } int max = 0; int imax = 0; for (int j = 1; j < maxFieldCount; j++) { if (recCount[j] > max) { imax = j; max = recCount[j]; } } return imax; } /** * set the special parser for a field. * @param field the field number, 0 is the first column. * @param fp the parser */ public void setFieldParser(int field, FieldParser fp) { FieldParser oldFp = this.fieldParsers[field]; this.fieldParsers[field] = fp; if (fp == UNITS_PARSER && UnitsUtil.isTimeLocation(units[field])) { setPropertyPattern(null); } propertyChangeSupport.firePropertyChange(PROPERTY_FIELD_PARSER, oldFp, fp); } /** * creates a parser with @param fieldCount fields, named "field0,...,fieldN" * @param fieldCount the number of fields * @return the file parser */ public static AsciiParser newParser(int fieldCount) { String[] fieldNames = new String[fieldCount]; for (int i = 0; i < fieldCount; i++) { fieldNames[i] = "field" + i; } return new AsciiParser(fieldNames); } /** * creates a parser with the named fields. * @param fieldNames the names for each field * @return the file parser */ public static AsciiParser newParser(String[] fieldNames) { return new AsciiParser(fieldNames); } /** * skip a number of lines before trying to parse anything. This can be * set to point at the first valid line, and the RecordParser will be * configured using that line. * @param skipLines */ public void setSkipLines(int skipLines) { this.skipLines = skipLines; } /** * limit the number of records read. parsing will stop once this number of * records is read into the result. This is Integer.MAX_VALUE by default. * @param recordCountLimit */ public void setRecordCountLimit(int recordCountLimit) { this.recordCountLimit = recordCountLimit; if ( this.recordStart>0 ) { this.recordCountLimit+= this.recordStart; } } /** * set the number of records to skip before accumulating the result. * @param recordStart */ public void setRecordStart(int recordStart) { if ( recordStart<0 ) throw new IllegalArgumentException("must be positive"); this.recordStart= recordStart; if ( this.recordCountLimit3 ) { int nonAsciiCount= getNonAsciiCount(line); if ( nonAsciiCount>20 || nonAsciiCount*100/line.length()>20 ) { throw new IOException("stream does not appear to be ascii"); } } firstRecord = line.length()>132 ? ( line.substring(0,132)+"..." 
) : line; builder.putProperty(PROPERTY_FIRST_RECORD, firstRecord); if ( line.length()>1 && ((int)line.charAt(0))==0xFEFF ) { //Excel UTF non-space line= line.substring(1); } } // *** here's where we parse each record *** if (recordParser.tryParseRecord(line, irec, builder)) { acceptRecord= true; if ( whereParm!=null ) { String[] fields= new String[recordParser.fieldCount()]; if ( recordParser.splitRecord(line,fields) ) { String field= fields[iwhereParm].trim(); int icomp= whereComp.compare( field, whereValue ); acceptRecord= false; if ( whereEq && icomp==0 ) { acceptRecord= true; } else if ( whereNe && icomp!=0 ) { acceptRecord= true; } else if ( whereSign==icomp ) { acceptRecord= true; } } } if ( acceptRecord ) { irec++; builder.nextRecord(); } } else { //System.out.println(line); } } catch (NumberFormatException e) { logger.log(Level.SEVERE, e.getMessage(), e); } } lastLine = line; line = recordParser.readNextRecord(reader); } mon.finished(); Object o= builder.properties.get( QDataSet.USER_PROPERTIES ); if ( o==null ) { builder.putProperty(QDataSet.USER_PROPERTIES, new HashMap(builder.properties)); // put discovered properties into } if ( bundleDescriptor!=null ) { // it shouldn't be null. builder.putProperty( QDataSet.BUNDLE_1, bundleDescriptor ); } WritableDataSet result= builder.getDataSet(); if ( recordStart>0 ) { result= (WritableDataSet)result.trim(recordStart,result.length()); } if ( acceptRecord==false ) { result= (WritableDataSet)result.trim(0,result.length()-1); } return result; } /** * return true if the header appears to contain JSON code which could be * interpreted as a "Rich Header" (a.k.a. JSONHeadedASCII). This is * a very simple test, simply looking for #{ and #} * with a colon contained within. * @see https://github.com/JSONheadedASCII/examples * @param header string containing the commented header. * @return true if parsing as a Rich Header should be attempted. */ public static boolean isRichHeader( String header ) { if ( header.length()==0 ) return false; String hash= header.charAt(0)=='#' ? "\\#" : ""; // we might have popped off all the comment prefixes (#). Pattern p= Pattern.compile(hash+"\\s*\\{"); Matcher m= p.matcher(header); if ( m.find() ) { int istart= m.start(); int iend= m.end(); p= Pattern.compile(hash+".*\\}"); m= p.matcher(header); if ( m.find( iend ) ) { iend= m.end(); String jsonSrc= header.substring(istart,iend); return jsonSrc.contains(":"); } } return false; } /** * return true if the parsed file provided a rich ascii header. Presently * this is the header defined in http://autoplot.org/richAscii. This must * be called after the file is parsed. * @return true if the parsed file provided a rich ascii header. */ public boolean isRichHeader() { if ( this.isRichAscii==null ) { throw new IllegalArgumentException("file must be parsed before calling isRichHeader"); } return this.isRichAscii; } /** * attempt to parse the metadata in the headers. If the header contains * a pair of braces {}, then we assume it's a special JSON-formatted header * with QDataSet metadata for the UNITS and LABELs. If not, then we * just look for name/value pairs as specified by the propertyPattern. * * In the JSON case, we form a bundle descriptor (a special QDataSet) which * contains the properties for each bundled dataset. For the default case, * we assign the values to the USER_PROPERTIES. 
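     *
     * For example (an illustrative header; the property names are made up), with
     * setPropertyPattern(NAME_COLON_VALUE_PATTERN) the header lines
     * <pre>
     *   title: My Data
     *   source: example.dat
     * </pre>
     * (following the comment prefix) become the USER_PROPERTIES entries
     * "title"="My Data" and "source"="example.dat".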
* @param header * @param builder */ private void parseMeta( String header, DataSetBuilder builder ) { boolean doJSON= isRichHeader(header); if ( doJSON ) { try { //System.err.println( "== JSON Header == \n"+header ); logger.fine("Parsing Rich JSON Header..."); bundleDescriptor = AsciiHeadersParser.parseMetadata(header, getFieldNames(), getFieldLabels() ); builder.putProperty( QDataSet.BUNDLE_1, bundleDescriptor ); bundleDescriptor.property(QDataSet.LABEL, 1); //move dimensionless properties to the dataset. Map props= DataSetUtil.getProperties( bundleDescriptor, DataSetUtil.globalProperties(), null ); for ( Entry e: props.entrySet() ) { String k= e.getKey(); builder.putProperty( k, e.getValue() ); } for ( int j=0; j userProps= new LinkedHashMap(); for ( String line2: header.split("\n") ) { Matcher m2= propertyPattern.matcher(line2); if ( m2.matches() ) { userProps.put( m2.group(1).trim(), m2.group(2).trim() ); } } builder.putProperty( QDataSet.USER_PROPERTIES, userProps ); } } } else { if ( propertyPattern!=null ) { Map userProps= new LinkedHashMap(); for ( String line2: header.split("\n") ) { Matcher m2= propertyPattern.matcher(line2); if ( m2.matches() ) { userProps.put( m2.group(1).trim(), m2.group(2).trim() ); } } builder.putProperty( QDataSet.USER_PROPERTIES, userProps ); } SparseDataSetBuilder sdsb= new SparseDataSetBuilder(2); sdsb.setQube( new int[] { units.length, 0 } ); for ( int i=0; i getRichFields() { LinkedHashMap result= new LinkedHashMap<>(); if ( bundleDescriptor!=null ) { for ( int i=0; i0 ) { len=1; for ( int j=0; j126 ) && ch!=9 ) nonAsciiCount++; } return nonAsciiCount; } public static interface RecordParser { /** * return the next record in a String, or null of no more records exist. * @param reader * @return * @throws IOException */ String readNextRecord( BufferedReader reader ) throws IOException; /** * returns true if the line appears to be a record. If it is a record, * then the record is inserted into the builder. * @param line the line from the file. * @param irec the record number * @param builder the builder into which the data is inserted. * @return true if the line appeared to be a record. */ boolean tryParseRecord(String line, int irec, DataSetBuilder builder); /** * indicate the number of fields this RecordParser is * expecting on each line. * @return the field count. */ int fieldCount(); /** * return the number of fields in this line. All records will have this * number of fields. This is used for discovery and to configure the parser. * @param line the line from the file, to attempt parsing. * @return the number of fields found. */ int fieldCount(String line); /** * attempts to extract fields from the record, returning true if * the record could be split. * @param line the line from the file. * @param fields array to store the fields. fieldCount() should be used * to determine the length of the array. * @return true if the line is a record that can be split into fields. */ boolean splitRecord( String line, String[] fields ); } /** * A FieldParser takes character data and returns a number representing * the data. The units of the field are often used with this when parsing. */ public static interface FieldParser { /** * parse the field into a double representing * @param field the field * @param columnIndex the column index. * @return the double representing * @throws ParseException */ double parseField(String field, int columnIndex) throws ParseException; } /** * parses the field using Double.parseDouble, Java's double parser. 
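     * A different FieldParser can be installed for a column with setFieldParser; for
     * example (an illustrative sketch, where column 2 is hypothetical), a column of
     * hexadecimal values could be handled with:
     * <pre>
     *   parser.setFieldParser( 2, new AsciiParser.FieldParser() {
     *       public double parseField( String field, int columnIndex ) {
     *           return Long.parseLong( field.trim(), 16 );
     *       }
     *   } );
     * </pre>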
*/ public static final FieldParser DOUBLE_PARSER = new FieldParser() { @Override public final double parseField(String field, int columnIndex) { if ( field.length()==1 ) { double d= field.charAt(0)-'0'; // bugfix '+' caused exception http://www.dartmouth.edu/~rdenton/Data/DentonTakahashiGOES1980-1991MassDensityWithHeader.txt?skipLines=91&depend0=FractionlYear&column=AE return ( d<0 || d>9 ) ? Double.NaN : d; } else { return Double.parseDouble(field); } } @Override public String toString() { return "doubleParser"; } }; /** * delegates to the unit object set for this field to parse the data. */ public final FieldParser UNITS_PARSER = new FieldParser() { @Override public final double parseField(String field, int columnIndex) throws ParseException { Units u = AsciiParser.this.units[columnIndex]; return u.parse(field).doubleValue(u); } @Override public String toString() { return "unitsParser"; } }; /** * uses the EnumerationUnits for the field to create a Datum. */ public final FieldParser ENUMERATION_PARSER = new FieldParser() { @Override public final double parseField(String field, int columnIndex) throws ParseException { Units units= AsciiParser.this.units[columnIndex]; if ( !( units instanceof EnumerationUnits ) ) { throw new IllegalStateException("ENUMERATION_PARSER needed EnumerationUnits"); } EnumerationUnits u = (EnumerationUnits)units; field= field.trim(); try { Datum d= u.createDatum(field); // rte_2038937185 that Ivar sees. return d.doubleValue(u); } catch ( NullPointerException ex ) { throw ex; // w/Ivar } } @Override public String toString() { return "enumerationParser"; } }; /** * hide the nuances of Java's split function. When the string endswith the * regex, add an empty field. Also, trim the string so leading and trailing * whitespace is not treated as a delimiter. Last, we need to guard against * commas within a string, so a,b,",",d,e has 5 fields. * @param string * @param regex regular expression like \\s+ * @return string array containing the fields. */ private static String[] split(String string, String regex) { String[] ss; if ( regex.equals("\\s+") ) { ss= string.trim().split(regex); // do what you did before. } else { switch (regex) { case ",": case ";": if ( (int)(string.charAt(string.length()-1))==8221 ) { logger.finer("trailing right quote detected"); string= string.substring(0,string.length()-1)+"\""; } ss= string.trim().split( regex + "(?=([^\"]*\"[^\"]*\")*[^\"]*$)",-2); break; default: ss= string.trim().split(regex,-2); break; } } return ss; } /** * create a delimeter-based record parser by splitting the line and counting * the number of fields. If the line appears to contain column headings, * then column names will be set as well. * This has the side effect of turning off property record pattern. * Trailing and leading whitespace is ignored. * @param line a record to parse. * @param fieldSep separating regex such as "," or "\t" or "\s+" * @param lineNum the line number, 1 is first line, used for debugging, * -1 means the one used for parsing in production. 
* @return */ private DelimParser createDelimParser(String line, String fieldSep, int lineNum) { logger.entering( "AsciiParser", "createDelimParser" ); String[] ss = split(line.trim(), fieldSep); initializeByFieldCount(ss.length); initializeUnitsByGuessing(ss,lineNum); fieldLabels= new String[fieldCount]; fieldUnits= new String[fieldCount]; boolean isColumnHeaders = true; for (int i = 0; i < ss.length; i++) { Matcher m; m = COLUMN_ID_HEADER_PATTERN.matcher(ss[i]); if (m.matches()) { String n= m.group(1).trim(); if ( n.length()!=3 || !n.equalsIgnoreCase("nan") ) { fieldLabels[i] = n; fieldNames[i] = Ops.safeName( fieldLabels[i] ); fieldUnits[i]= m.group(3); if (fieldUnits[i]!=null) { fieldUnits[i]= fieldUnits[i].trim(); if ( fieldUnits[i].length()>2 ) { char ch= fieldUnits[i].charAt(0); if ( !Character.isLetter(ch) ) { // this can't be turned into a unit, so just tack this on to the label. fieldLabels[i]= fieldLabels[i] + m.group(2); fieldUnits[i]= null; } } } } else { if (isColumnHeaders) { logger.log(Level.FINEST, "parsed line appears to contain NaN''s, and is not a column header because of field #{0}: {1}", new Object[]{i, ss[i]}); } isColumnHeaders = false; } } else if ((m=COLUMN_CHANNEL_HEADER_PATTERN.matcher(ss[i])).matches() && m.group(3).length()>0 && m.group(5).length()>0 ) { String n= m.group(1).trim(); fieldLabels[i] = n; if ( m.group(2).length()>0 ) { // make valid java identifier fieldNames[i] = n.replaceAll("-", "_"); } else { fieldNames[i] = "ch_"+n.replaceAll("-", "_"); } fieldUnits[i]= null; } else { if (isColumnHeaders) { logger.log(Level.FINEST, "first parsed line does not appear to be column header because of field #{0}: {1}", new Object[]{i, ss[i]}); } isColumnHeaders = false; } } if (!isColumnHeaders) { for (int i = 0; i < fieldCount; i++) { if (fieldNames[i] == null) { fieldNames[i] = "field" + i; } } //TODO: this will clean up cases where we get extraneous headers. // int fieldNameCount=0; // for (int i = 0; i < fieldCount; i++) { // if (fieldNames[i] != null) { // fieldNameCount++; // } // } // if ( fieldNameCount=fieldCount/2 ) { guessUnits= false; parseMeta( "", builder ); // we must reset the bundle descriptor } } Exception firstException= null; for (j = 0; j < fieldCount; j++) { tryCount++; if (doParseField[j]) { String parseable = ss[j]; try { double d= fieldParsers[j].parseField(parseable, j); if ( builder!=null ) builder.putValue(irec, j, d ); okayCount++; } catch (ParseException | NumberFormatException e) { if ( irec==0 ) { logger.fine("ignore fails on the first line"); failCount++; } else { if ( firstException==null ) firstException= e; failCount++; } if ( builder!=null ) builder.putValue(irec, j, -1e31 ); //TODO } //TODO } } if ( firstException!=null && failCount>0 && failCount-1 ) { if ( guessUnits ) return false; // we're still trying to figure out the units. if ( ( failCount < tryCount ) && ( okayCount > ( enumFieldCount + 1 ) || (failCount < 3-enumFieldCount ) ) ) { return true; } else { return false; } } else { return ( failCount < tryCount ) && ( okayCount > 1 || failCount < 3 ); } } @Override public int fieldCount() { return fieldCount; } @Override public int fieldCount(String line) { return fields(line).length; } public void setSkipField(int ifield, boolean skip) { this.doParseField[ifield] = !skip; } /** * return the string for each field. This is useful * for discovery, and is not used in the bulk parsing. 
* @param line * @return */ private String[] fields(String line) { String[] many= new String[1000]; splitRecord(line, many); int count=0; for ( int i=0; i0 && s.charAt(0)=='"' && s.charAt(n-1)=='"' ) { fields[i]= s.substring(1,n-1); } } return ( ifield == fields.length && index==len ) ; } @Override public String toString() { return "AsciiParser.DelimParser: delim="+this.delimRegex + " fieldCount="+this.fieldCount+ " serialNumber="+this.serialNumber; } } /** * convert F77 style to C style. * X3,I3,F8 -> 3x,3i,8f * Repeats are not supported. * @param format * @return * @see org.autoplot.metatree.MetadataUtil#normalizeFormatSpecifier */ private static String[] f77FormatToCFormat( String[] format ) { String[] ss= new String[format.length+1]; for ( int i=1;i1 ) {// I3 -> 3i; Pattern p= Pattern.compile( "(\\d*)(\\D)(\\d*).*"); // the .* is for F8.3 so the .3 is ignored Matcher m= p.matcher(field); if ( m.matches() ) { String type= m.group(2); int repeat= !m.group(1).equals("") ? Integer.parseInt(m.group(1)) : 1; int len= !m.group(3).equals("") ? Integer.parseInt(m.group(3)) : -1; if ( type.toLowerCase().equals("x") ) { if ( len==-1 ) len= repeat; else len= repeat*len; ss[i]= String.valueOf(len) + type; } else { if ( repeat!=1 ) { throw new IllegalArgumentException("repeats are only allowed for X: "+field); } else { ss[i]= String.valueOf(len) + type; } } } else { throw new IllegalArgumentException("unable to parse: "+field); } } else { ss[i]= field; } } ss[0]=""; return ss; } /** * Convert FORTRAN (F77) style format to C-style format specifiers. * @param format for example "%5d%5d%9f%s" * @return for example "d5,d5,f9,a" * @see org.autoplot.metatree.MetadataUtil#normalizeFormatSpecifier */ public static String getRegexForFormat( String format ) { String[] ss= format.split("%"); if ( ss.length==1 ) { // support $ as well as %, since % is not nice in URIs. String[] ss1= format.split("\\$"); if ( ss1.length>1 ) ss= ss1; } if ( ss.length==1 ) { String[] ss2= format.split(","); //FORTRAN style F if ( ss2.length>1 ) { ss= f77FormatToCFormat( ss2 ); } } // int count= 0; // for (String s : ss) { // if (!s.toLowerCase().endsWith("x")) { // count++; // } // } //String[] fc = new String[count]; int[] lengths = new int[ss.length]; for (int i = 0; i < lengths.length; i++) { lengths[i] = -1; // -1 indicates not known, but we'll figure out as many as we can. 
} //String[] delim = new String[count + 1]; StringBuilder build = new StringBuilder(100); //delim[0] = ss[0]; //int ifield= 0; for (int i = 1; i < ss.length; i++) { int pp = 0; while (Character.isDigit(ss[i].charAt(pp)) || ss[i].charAt(pp) == '-') { pp++; } if (pp > 0) { lengths[i] = Integer.parseInt(ss[i].substring(0, pp)); } else { lengths[i] = -1; // determine later by field type } logger.log( Level.FINE, "ss[i]={0}", ss[i] ); String fci; if ( ss[i].toLowerCase().endsWith("x") ) { if ( lengths[i]==-1 ) { fci= "\\s*\\S+"; } else { //fc[i]= "(" + "...................................................................".substring(0,lengths[i]) + ")"; fci= "" + ".{"+lengths[i]+"}"; } } else { if ( lengths[i]==-1 ) { fci= "\\s*(\\S+)"; } else { //fc[i]= "(" + "...................................................................".substring(0,lengths[i]) + ")"; fci= "(" + ".{"+lengths[i]+"})"; } //fc[ifield++]= fci; } build.append(fci); if ( lengths[i]==-1 ) build.append("\\s*"); } String regex= build.toString(); //System.err.println( "regex= "+ regex ); return regex; } /** * see {@code private TimeParser(String formatString, Map fieldHandlers)}, * which is very similar.
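     * For example (worked out from getRegexForFormat, shown for illustration), the
     * format "%5d%5d%9f%s" produces roughly the regular expression
     * <pre>
     *   (.{5})(.{5})(.{9})\s*(\S+)\s*
     * </pre>
     * Example format strings: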
     * <ul>
     * <li>"%5d%5d%9f%s"
     * <li>"d5,d5,f9,a"
     * </ul>
     * @param format the format string, such as "%5d%5d%9f%s"
     * @return the RegexParser for the format, which is also installed as the record parser.
     * @see org.das2.datum.TimeParser
     */
    public RegexParser getRegexParserForFormat(String format) {
        String regex= getRegexForFormat(format);
        RegexParser rp= new RegexParser(this,regex);
        setRecordParser(rp);
        return rp;
    }

    /**
     * return a regex parser for the given regular expression.  Groups are used
     * for the fields, for example getRegexParser( 'X (\d+) (\d+)' ) would
     * parse lines like "X 00005 00006".
     * @param regex the regular expression, with a group for each field.
     * @return the regex parser
     */
    public RegexParser getRegexParser( String regex ) {
        return new RegexParser(this,regex);
    }

    /**
     * This initializes the parser, setting:
     * <ul>
     * <li>fieldCount
     * <li>fieldNames to "field"+i
     * <li>fieldParsers to DOUBLE_PARSER
  • units to Units.dimensionless. * @param count */ private void initializeByFieldCount( int count ) { fieldCount= count; fieldNames = new String[fieldCount]; fieldParsers = new FieldParser[fieldCount]; fieldLabels= new String[fieldCount]; fieldUnits= new String[fieldCount]; units = new Units[fieldCount]; //this is the one place where units array is initialized for (int i = 0; i < fieldCount; i++) { fieldParsers[i] = DOUBLE_PARSER; fieldNames[i] = "field" + i; fieldLabels[i] = fieldNames[i]; fieldUnits[i] = ""; _setUnits( i, Units.dimensionless ); } } private static Units guessUnits( String sval ) { if ( sval.length()>0 && sval.charAt(0)=='"' && sval.charAt(sval.length()-1)=='"' ) { sval= sval.substring(1,sval.length()-1); } try { Units.dimensionless.parse(sval); return Units.dimensionless; } catch ( ParseException ex ) { logger.log(Level.FINER, "fails to parse as number: {0}", sval); } try { AsciiParser.UNIT_UTC.parse(sval); return AsciiParser.UNIT_UTC; } catch ( ParseException ex ) { logger.log(Level.FINER, "fails to parse as time: {0}", sval); } return EnumerationUnits.create("enum"); } /** * initialize the units by guessing at each field. This will * only switch between dimensionless and UTC times. * @param ss the fields. * @param lineNumber the line number for reference when debugging. */ private void initializeUnitsByGuessing( String[] ss, int lineNumber ) { boolean useOldCode=false; if (useOldCode) { initializeUnitsByGuessingOld(ss, lineNumber); } else { logger.log(Level.FINE, "guess units at line {0}", lineNumber); for (int i = 0; i < ss.length; i++) { String field= ss[i].trim(); if ( field.length()==0 ) continue; Units u= guessUnits(field); if ( UnitsUtil.isTimeLocation(u) ) { _setUnits( i, Units.t2000 ); fieldParsers[i] = UNITS_PARSER; } else if ( u==Units.dimensionless ) { _setUnits( i, u ); fieldParsers[i] = DOUBLE_PARSER; } else if ( u instanceof EnumerationUnits ) { _setUnits( i, u ); fieldParsers[i] = ENUMERATION_PARSER; } else { _setUnits( i, u ); fieldParsers[i] = UNITS_PARSER; } if ( bundleDescriptor!=null ) { bundleDescriptor.putProperty( QDataSet.UNITS, i, u ); } } } } /** * initialize the units by guessing at each field. This will * only switch between dimensionless and UTC times. * @param ss the fields. */ private void initializeUnitsByGuessingOld( String[] ss, int lineNumber ) { logger.log(Level.FINE, "guess units at line {0}", lineNumber); for (int i = 0; i < ss.length; i++) { if ( isIso8601Time(ss[i].trim()) ) { _setUnits( i, Units.t2000 ); fieldParsers[i]= UNITS_PARSER; } else { _setUnits( i, Units.dimensionless ); fieldParsers[i] = DOUBLE_PARSER; } } } /** * private a single place where the units array is modified, so that it * is easier to debug. * @param i the column number * @param u the unit */ private void _setUnits( int i, Units u ) { //if ( i==20 ) { // System.err.println("here we are at 2180"); //} this.units[i]= u; } /** * parser uses a regular expression to match each record. 
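     * For example (an illustrative sketch, following the getRegexParser example above):
     * <pre>
     *   AsciiParser parser= AsciiParser.newParser(2);
     *   RegexParser rp= parser.getRegexParser( "X (\\d+) (\\d+)" );
     * </pre>
     * Each group becomes one field.  Note getRegexParser only constructs the parser,
     * while getRegexParserForFormat also installs it as the record parser.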
*/ public static final class RegexParser implements RecordParser { Pattern recordPattern; AsciiParser parser; boolean doGuessUnits= true; public static String[] getNamedGroups( String regex ) { Pattern p= Pattern.compile("\\(\\?\\<([a-zA-Z][0-9a-zA-Z]*)\\>"); Pattern parenPattern= Pattern.compile("\\("); List result= new ArrayList<>(); Matcher m= parenPattern.matcher(regex); while ( m.find() ) { if ( m.start()==0 || regex.charAt(m.start()-1)!='\\' ) { Matcher nm= p.matcher(regex.substring(m.start())); if ( nm.find() && nm.start()==0 ) { String name= nm.group(1); result.add(name); } else { result.add(""); } } else { logger.finer("it wasn't actually a group, it was backslash paren"); } } return result.toArray( new String[result.size()] ); } public RegexParser( AsciiParser parser, String regex) { recordPattern = Pattern.compile(regex); this.parser= parser; parser.initializeByFieldCount(recordPattern.matcher("").groupCount()); String[] gg= getNamedGroups(regex); for ( int i=0; i0 && i 0) { System.err.println("error(s) parsing record number " + (irec) + ": "); System.err.println(line); char[] lineMarker = new char[columnOffsets[fieldCount - 1] + columnWidths[fieldCount - 1]]; for (int i = 0; i < fieldCount; i++) { if (fails[i]) { for (int j = 0; j < columnWidths[i]; j++) { lineMarker[j + columnOffsets[i]] = '-'; } } } System.err.println(new String(lineMarker)); } // the record is parsable if there are two or more parsable fields. // it is not parsable if no fields can be parsed. return ( failCount < tryCount ) && ( okayCount > 1 || failCount < 3 ); } @Override public int fieldCount(String line) { return line.split("\\s*").length; } public String[] fields(String line) { String[] result = new String[fieldCount]; for (int i = 0; i < fieldCount; i++) { result[i] = line.substring(columnOffsets[i], columnOffsets[i] + columnWidths[i]); } return result; } @Override public boolean splitRecord(String line, String[] fields) { if ( line.length() >= columnOffsets[fieldCount-1] + columnWidths[fieldCount-1] ) { for (int i = 0; i < fieldCount; i++) { fields[i] = line.substring(columnOffsets[i], columnOffsets[i] + columnWidths[i]); } return true; } else { return false; } } } /** * return the number of fields in each record. Note the RecordParsers * also have a fieldCount, which should be equal to this. This allows them * to be independent of the parser. * @return */ public int getFieldCount() { return fieldCount; } /** * return the name of each field. field0, field1, ... are the default names when * names are not discovered in the table. Changing the array will not affect * internal representation. * @return */ public String[] getFieldNames() { if ( this.fieldNames==null ) { throw new IllegalArgumentException("unable to identify fields"); } else { return Arrays.copyOf( this.fieldNames, this.fieldNames.length ); } } /** * return the labels found for each field. If a label wasn't found, * then the name is returned. * @return */ public String[] getFieldLabels() { if ( fieldLabels==null ) { fieldLabels= new String[fieldNames.length]; } for ( int i=0; iindex. */ public Units getUnits(int index) { if ( this.units[index]==Units.dimensionless && this.fieldUnits[index]!=null && this.fieldUnits[index].length()>0 ) { return Units.lookupUnits( this.fieldUnits[index] ); } else { return this.units[index]; } } /** * Indexed setter for property units. This now sets the field parser for * the field to be a UNITS_PARSER if it is the default DOUBLE_PARSER. * @param index Index of the property. 
* @param units New value of the property at index. */ public void setUnits( int index, Units units) { _setUnits( index, units ); if ( fieldParsers[index]==DOUBLE_PARSER ) setFieldParser(index,UNITS_PARSER); if ( fieldParsers[index]==ENUMERATION_PARSER ) { setFieldParser(index,UNITS_PARSER); } propertyChangeSupport.firePropertyChange("units", null, null); } /** * Set all the units at once. This now sets the field parser for * each field to be a UNITS_PARSER if it is the default DOUBLE_PARSER. * @param u array (or varargs) of units to be applied to the 0,1,2nd,... fields. */ public void setUnits( Units ... u ) { System.arraycopy(u, 0, this.units, 0, u.length); for ( int i=0; i=fieldCount ) { throw new IllegalArgumentException("bad column parameter: the record parser only expects "+fieldCount+" columns"); } return icol; } /** * Holds value of property fillValue. */ private double fillValue = -1e31; /** * return the fillValue. numbers that parse to this value are considered * to be fill. Note validMin and validMax may be used as well. * @return Value of property fillValue. */ public double getFillValue() { return this.fillValue; } /** * numbers that parse to this value are considered to be fill. * @param fillValue New value of property fillValue. */ public void setFillValue(double fillValue) { double oldFillValue = this.fillValue; this.fillValue = fillValue; propertyChangeSupport.firePropertyChange("fillValue", oldFillValue, fillValue); } protected double validMin = Double.NEGATIVE_INFINITY; public static final String PROP_VALIDMIN = "validMin"; /** * get the minimum valid value for any field. * @return validMin */ public double getValidMin() { return validMin; } /** * set the minimum valid value for any field. Values less than * this are to be considered invalid. * @param validMin */ public void setValidMin(double validMin) { double oldValidMin = this.validMin; this.validMin = validMin; propertyChangeSupport.firePropertyChange(PROP_VALIDMIN, oldValidMin, validMin); } protected double validMax = Double.POSITIVE_INFINITY; public static final String PROP_VALIDMAX = "validMax"; /** * get the maximum value for any field. * @return the validMax */ public double getValidMax() { return validMax; } /** * set the maximum value for any field. Values above this are to be * considered invalid. * @param validMax */ public void setValidMax(double validMax) { double oldValidMax = this.validMax; this.validMax = validMax; propertyChangeSupport.firePropertyChange(PROP_VALIDMAX, oldValidMax, validMax); } }
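
/*
 * Illustrative usage sketch.  The file name "data.dat" is a placeholder, and the
 * final read of the table into a WritableDataSet uses the parser's read method,
 * which falls outside this excerpt.
 */
class AsciiParserUsageSketch {
    public static void main( String[] args ) throws IOException {
        AsciiParser parser= AsciiParser.newParser(4);                    // placeholder field count; reset when the parser is configured below
        parser.setSkipLines(1);                                          // assume one header line before the data
        parser.setDelimParser( "data.dat", AsciiParser.DELIM_COMMA );    // comma-delimited; this also installs the record parser
        System.err.println( Arrays.toString( parser.getFieldNames() ) ); // discovered column names, or field0, field1, ...
        String rec= parser.readFirstParseableRecord( "data.dat" );       // peek at the first record that parses
        System.err.println( rec );
    }
}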