CharSeparatedValues.java

/*
 * Copyright (c) 2001-2017, Zoltan Farkas All Rights Reserved.
 *
 * This library is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * This library is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with this program; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.
 *
 * Additionally licensed with:
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.spf4j.io.csv;

import edu.umd.cs.findbugs.annotations.SuppressFBWarnings;
import gnu.trove.map.hash.THashMap;
import java.io.BufferedReader;
import java.io.File;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.Reader;
import java.io.UncheckedIOException;
import java.io.Writer;
import java.nio.charset.Charset;
import java.nio.file.Files;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import javax.annotation.CheckReturnValue;
import javax.annotation.ParametersAreNonnullByDefault;
import org.spf4j.base.Arrays;
import org.spf4j.base.CharSequences;
import org.spf4j.io.PushbackReader;

/**
 * Supports Character Separated values format as described at: https://en.wikipedia.org/wiki/Comma-separated_values.
 * either of \n \r or \r\n generalized to custom separator character.
 * are valid end of line delimiters
 *
 * why another implementation? because I need one that is as fast as possible, and as flexible as possible.
 *
 * @author zoly
 */
@ParametersAreNonnullByDefault
@SuppressFBWarnings("NP_LOAD_OF_KNOWN_NULL_VALUE") // FB gets it wrong here
public final class CharSeparatedValues {

  /**
   * http://unicode.org/faq/utf_bom.html#BOM
   */
  public static final int UTF_BOM = '\uFEFF';

  private final char separator;
  private final char[] toEscape;

  public CharSeparatedValues(final char separator) {
    if (separator == '\n' || separator == '\r' || separator == '"') {
      throw new IllegalArgumentException("Illegal separator character " + separator);
    }
    this.separator = separator;
    this.toEscape = new char[]{separator, '\n', '\r', '"'};
  }

  public CharSeparatedValues(final char separator, final char... extraCharsToEscape) {
    if (separator == '\n' || separator == '\r' || separator == '"') {
      throw new IllegalArgumentException("Illegal separator character " + separator);
    }
    this.separator = separator;
    this.toEscape = new char[4 + extraCharsToEscape.length];
    this.toEscape[0] = separator;
    this.toEscape[1] = '\n';
    this.toEscape[2] = '\r';
    this.toEscape[3] = '"';
    System.arraycopy(extraCharsToEscape, 0, this.toEscape, 4, extraCharsToEscape.length);
  }

  public void writeCsvRow(final Appendable writer, final Object... elems) throws IOException {
    writeCsvRowNoEOL(writer, elems);
    writer.append('\n');
  }

  @SafeVarargs
  public final String toCsvRowString(final Object... elems) {
    StringBuilder result = new StringBuilder(elems.length * 8);
    try {
      writeCsvRowNoEOL(result, elems);
    } catch (IOException ex) {
      throw new UncheckedIOException(ex);
    }
    return result.toString();
  }

  public void writeCsvRowNoEOL(final Appendable writer, final Object... elems) throws IOException {
    if (elems.length > 0) {
      int i = 0;
      Object elem = elems[i++];
      if (elem != null) {
        writeCsvElement(elem.toString(), writer);
      }
      while (i < elems.length) {
        writer.append(separator);
        elem = elems[i++];
        if (elem != null) {
          writeCsvElement(elem.toString(), writer);
        }
      }
    }
  }

  public void writeCsvRow2(final Appendable writer, final Object obj, final Object... elems)
          throws IOException {
    if (obj != null) {
      writeCsvElement(obj.toString(), writer);
    }
    for (Object elem : elems) {
      writer.append(separator);
      if (elem != null) {
        writeCsvElement(elem.toString(), writer);
      }
    }
    writer.append('\n');
  }

  public void writeCsvRow(final Appendable writer, final long... elems) throws IOException {
    writeCsvRowNoEOL(elems, writer);
    writer.append('\n');
  }

  public void writeCsvRowNoEOL(final long[] elems, final Appendable writer) throws IOException {
    if (elems.length > 0) {
      int i = 0;
      writer.append(Long.toString(elems[i++]));
      while (i < elems.length) {
        writer.append(separator);
        writer.append(Long.toString(elems[i++]));
      }
    }
  }

  public void writeCsvRow(final Appendable writer, final Iterable<?> elems) throws IOException {
    writeCsvRowNoEOL(elems, writer);
    writer.append('\n');
  }

  public void writeCsvRowNoEOL(final Iterable<?> elems, final Appendable writer) throws IOException {
    Iterator<?> it = elems.iterator();
    if (it.hasNext()) {
      Object next = it.next();
      if (next != null) {
        writeCsvElement(next.toString(), writer);
      }
      while (it.hasNext()) {
        writer.append(separator);
        next = it.next();
        if (next != null) {
          writeCsvElement(next.toString(), writer);
        }
      }
    }
  }

  public <T> T read(final File file, final Charset charset,
          final CsvMapHandler<T> handler) throws IOException, CsvParseException {
    try (BufferedReader br = new BufferedReader(new InputStreamReader(Files.newInputStream(file.toPath()), charset))) {
      return read(br, handler);
    }
  }

  public <T> T read(final File file, final Charset charset,
          final CsvHandler<T> handler) throws IOException, CsvParseException {
    try (BufferedReader br = new BufferedReader(new InputStreamReader(Files.newInputStream(file.toPath()), charset))) {
      return read(br, handler);
    }
  }

  public List<Map<String, String>> read(final Reader preader) throws IOException, CsvParseException {
    return read(preader, new ToListMapHandler());
  }

  public <T> T read(final Reader preader,
          final CsvMapHandler<T> handler) throws IOException, CsvParseException {
    return read(preader, new CsvMapHandler2CsvHandler<>(handler));
  }

  public List<String> readRow(final Reader reader) throws IOException, CsvParseException {
    return readRow(reader, new CsvRow2List());
  }

  public <T> T readRow(final Reader reader, final CsvRowHandler<T> handler) throws IOException, CsvParseException {
    return read(reader, new OneRowHandler<>(handler));
  }

  public <T> T read(final Reader preader,
          final CsvHandler<T> handler) throws IOException, CsvParseException {
    PushbackReader reader = new PushbackReader(preader);
    int firstChar = reader.read();
    if (firstChar != UTF_BOM && firstChar >= 0) {
      reader.unread(firstChar);
    }
    return readNoBom(reader, handler);
  }

  /**
   * reads CSV format until EOF of reader.
   *
   * @param <T>
   * @param preader
   * @param handler
   * @return
   * @throws IOException
   */
  public <T> T readNoBom(final PushbackReader reader, final CsvHandler<T> handler)
          throws IOException, CsvParseException {
    CsvReader r = reader(reader);
    handler.startRow(0);
    CsvReader.TokenType token = r.next();
    while (token != CsvReader.TokenType.END_DOCUMENT) {
      if (token == CsvReader.TokenType.ELEMENT) {
        handler.element(r.getElement());
        token = r.next();
      } else if (token == CsvReader.TokenType.END_ROW) {
        handler.endRow();
        token = r.next();
        if (token == CsvReader.TokenType.ELEMENT) {
          handler.startRow(r.currentLineNumber());
        }
      }
    }
    return handler.eof();
  }

  /**
   * read a CSV stream, as a Iterable over rows.
   * the List<String> instance is reused during iteration, you will need to copy content into
   * own data structure.
   * @param preader
   * @return
   */
  public  Iterable<Iterable<String>> asIterable(final Reader preader) {
    return () -> {
      try {
        return new CsvReader2Iterator(reader(preader));
      } catch (IOException ex) {
        throw new UncheckedIOException(ex);
      }
    };

  }

  /**
   * Iterate through the first row of your CSV.
   * the CharSequence is a re-0used char buffer you either need to parse the content out of copy it.
   * @param preader
   * @return
   */
  public Iterable<CharSequence> singleRow(final Reader preader) {
    try {
      CsvReader reader = reader(preader);
      return () -> new OneRowIterator(reader);
    } catch (IOException ex) {
      throw new UncheckedIOException(ex);
    }
  }

  public CsvReader reader(final Reader preader) throws IOException {
    PushbackReader reader = new PushbackReader(preader);
    int firstChar = reader.read();
    if (firstChar != UTF_BOM && firstChar >= 0) {
      reader.unread(firstChar);
    }
    return readerNoBOMILEL(reader);
  }

  /**
   * will ignore last empty line.
   * @param preader
   * @return
   * @throws IOException
   * @deprecated use reader
   */
  @Deprecated
  public CsvReader readerILEL(final Reader preader) throws IOException {
    return reader(preader);
  }

  /**
   * assumes there is not BOM. (byte order marker)
   * @param reader
   * @return
   */
  public CsvReader readerNoBOM(final PushbackReader reader) {
    return new CsvReaderImpl(reader);
  }

  /**
   * reader that there is not BOM. (byte order marker) and will ignore last empty line.
   * @param reader
   * @return
   * @deprecated use readerNoBOM.
   */
  @Deprecated
  public CsvReader readerNoBOMILEL(final PushbackReader reader) {
    return new CsvReaderImpl(reader);
  }

  public CsvWriter writer(final Writer writer) {
    return new CsvWriterImpl(writer);
  }

  public void writeCsvElement(final CharSequence elem, final Appendable writer) throws IOException {
    if (CharSequences.containsAnyChar(elem, toEscape)) {
      writeQuotedCsvElement(elem, writer);
    } else {
      writer.append(elem);
    }
  }

  public static void writeQuotedCsvElement(final CharSequence elem, final Appendable writer) throws IOException {
    writer.append('"');
    writeQuotedElementContent(elem, 0, elem.length(), writer);
    writer.append('"');
  }

  public static void writeQuotedElementContent(final CharSequence elem,
          final int start, final int end, final Appendable writer) throws IOException {
    for (int i = start; i < end; i++) {
      char c = elem.charAt(i);
      writeQuotedChar(c, writer);
    }
  }

  public static void writeQuotedChar(final char c, final Appendable writer) throws IOException {
    if (c == '"') {
      writer.append("\"\"");
    } else {
      writer.append(c);
    }
  }

   public CharSequence toCsvElement(final CharSequence elem) {
    if (CharSequences.containsAnyChar(elem, toEscape)) {
      StringBuilder sw = new StringBuilder(elem.length() + 4);
      try {
        writeQuotedCsvElement(elem, sw);
      } catch (IOException ex) {
        throw new UncheckedIOException(ex);
      }
      return sw;
    } else {
      return elem;
    }
  }

   public String toCsvElement(final String elem) {
    if (CharSequences.containsAnyChar(elem, toEscape)) {
      StringBuilder sw = new StringBuilder(elem.length() + 4);
      try {
        writeQuotedCsvElement(elem, sw);
      } catch (IOException ex) {
        throw new UncheckedIOException(ex);
      }
      return sw.toString();
    } else {
      return elem;
    }
  }

  /**
   * returns next character.
   *
   * @param reader
   * @param addElemTo
   * @return - next character or -1 if eof has been reached.
   * @throws IOException
   */
  @CheckReturnValue
  public int readCsvElement(final Reader reader, final StringBuilder addElemTo, final long lineNr)
          throws IOException, CsvParseException {
    int c = reader.read();
    if (c < 0) {
      return c;
    }
    if (c == '"') {
      c = reader.read();
      while (c >= 0) {
        if (c == '"') {
          int c2 = reader.read();
          if (c2 >= 0) {
            if (c2 == '"') {
              addElemTo.append((char) c);
            } else {
              return c2;
            }
          } else {
            return c2;
          }
        } else {
          addElemTo.append((char) c);
        }
        c = reader.read();
      }
      throw new CsvParseException("Escaped CSV element " + addElemTo + " not terminated correctly at " + lineNr);
    } else {
      while (c != separator && c != '\n' && c != '\r' && c >= 0) {
        addElemTo.append((char) c);
        c = reader.read();
      }
    }
    return c;
  }

  @Override
  public String toString() {
    return "CharSepValues{" + "separator=" + separator + '}';
  }

  private static class ToListMapHandler implements CsvMapHandler<List<Map<String, String>>> {

    private List<Map<String, String>> result = new ArrayList<>();

    @Override
    public void row(final Map<String, String> row) {
      result.add(row);
    }

    @Override
    public List<Map<String, String>> eof() {
      return result;
    }
  }

  private static class CsvMapHandler2CsvHandler<T> implements CsvHandler<T> {

    private final CsvMapHandler<T> handler;
    private boolean first = true;
    private final List<String> header = new ArrayList<>();
    private int elemIdx;
    private Map<String, String> row = null;
    private long lineNr;


    CsvMapHandler2CsvHandler(final CsvMapHandler<T> handler) {
      this.handler = handler;
    }

    @Override
    public void startRow(final long ln) {
      lineNr = ln;
      elemIdx = 0;
      if (!first) {
        row = new THashMap<>(header.size());
      }
    }

    @Override
    public void element(final CharSequence elem) throws CsvParseException {
      if (first) {
        header.add(elem.toString());
      } else {
        if (header.size() <= elemIdx) {
          throw new CsvParseException("Too many elements in row " + row + " at line " + lineNr);
        }
        row.put(header.get(elemIdx), elem.toString());
      }
      elemIdx++;
    }

    @Override
    public void endRow() {
      if (first) {
        first = false;
      } else {
        handler.row(row);
      }
    }

    @Override
    public T eof() {
      return handler.eof();
    }
  }

 private class CsvReaderImpl implements CsvReader {

    private final PushbackReader reader;
    private final StringBuilder currentElement = new StringBuilder();
    private CsvReader.TokenType currentToken;
    private CsvReader.TokenType nextToken;
    private long lineNr = 0;

    CsvReaderImpl(final PushbackReader reader) {
      this.reader = reader;
      this.currentToken = CsvReader.TokenType.START_DOCUMENT;
      this.nextToken = null;
    }

   @SuppressFBWarnings("SF_SWITCH_FALLTHROUGH")
   private void readNext() throws IOException, CsvParseException {
     // nextToken will always be null;
     switch (currentToken) {
       case END_DOCUMENT:
         nextToken = TokenType.END_DOCUMENT;
         return;
       case END_ROW:
         // handle special case of EOF followed by EOL.
         int peek = reader.read();
         if (peek < 0) {
           currentToken = TokenType.END_DOCUMENT;
           nextToken = TokenType.END_DOCUMENT;
           return;
         }
         reader.unread(peek);
       case START_DOCUMENT:
       case ELEMENT:
         currentElement.setLength(0);
         int next = readCsvElement(reader, currentElement, lineNr);
         currentToken = CsvReader.TokenType.ELEMENT;
         switch (next) {
           case '\r':
             lineNr++;
             nextToken = CsvReader.TokenType.END_ROW;
             int c2 = reader.read();
             if (c2 < 0) {
               return;
             }
             if (c2 != '\n') {
               reader.unread(c2);
             }
             return;
           case '\n':
             lineNr++;
             nextToken = CsvReader.TokenType.END_ROW;
             c2 = reader.read();
             if (c2 < 0) {
               return;
             }
             if (c2 != '\r') {
               reader.unread(c2);
               break;
             }
             break;
           default:
             if (next != separator) {
               if (next < 0) {
                 nextToken = CsvReader.TokenType.END_ROW;
               } else {
                 throw new CsvParseException("Unexpected character " + next + " at line" + lineNr);
               }
             }
         }
         return;
       default:
         throw new IllegalStateException("Invalid current token " + currentToken);

     }

   }

    @Override
    public CsvReader.TokenType next() throws IOException, CsvParseException {
      if (nextToken == null) {
        readNext();
        return currentToken;
      } else {
        CsvReader.TokenType result = nextToken;
        if (result != CsvReader.TokenType.END_DOCUMENT) {
          nextToken = null;
        }
        currentToken = result;
        return result;
      }
    }

    @Override
    public CsvReader.TokenType current() {
      return currentToken;
    }

    @Override
    public CharSequence getElement() {
      if (currentToken != TokenType.ELEMENT) {
        throw new IllegalStateException("No current element, current token is " + currentToken);
      }
      return currentElement;
    }

    @Override
    public long currentLineNumber() {
      return lineNr;
    }

  }

  private static class OneRowHandler<T> implements CsvHandler<T> {

    private final CsvRowHandler<T> handler;


    OneRowHandler(final CsvRowHandler<T> handler) {
      this.handler = handler;
    }

    @Override
    public void startRow(final long rowNr) {
      if (rowNr > 0) {
        throw new IllegalArgumentException("Multiple rows encountered for " + this);
      }
    }

    @Override
    public void element(final CharSequence elem) {
      handler.element(elem);
    }

    @Override
    public T eof() {
      return handler.eof();
    }

  }

  private static final class CsvRow2List implements CsvRowHandler<List<String>> {

    private final List<String> result = new ArrayList<>();

    @Override
    public void element(final CharSequence elem) {
      result.add(elem.toString());
    }

    @Override
    public List<String> eof() {
      return result;
    }
  }

  private class CsvWriterImpl implements CsvWriter {

    private final Writer writer;

    CsvWriterImpl(final Writer writer) {
      this.writer = writer;
    }
    private boolean isStartLine = true;

    @Override
    public void writeElement(final CharSequence cs) throws IOException {
      addComma();
      writeCsvElement(cs, writer);
    }

    private void addComma() throws IOException {
      if (isStartLine) {
        isStartLine = false;
      } else {
        writer.append(separator);
      }
    }

    @Override
    public void writeEol() throws IOException {
      writer.append('\n');
      isStartLine = true;
    }

    @Override
    public void flush() throws IOException {
      writer.flush();
    }


    @Override
    public ElementAppendable startQuotedElement() throws IOException {
      addComma();
      writer.write('"');
      return new ElementAppendable() {
        @Override
        public Appendable append(final CharSequence csq) throws IOException {
          writeQuotedElementContent(csq, 0, csq.length(), writer);
          return this;
        }

        @Override
        public Appendable append(final CharSequence csq, final int start, final int end) throws IOException {
          writeQuotedElementContent(csq, start, end, writer);
          return this;
        }

        @Override
        public Appendable append(final char c) throws IOException {
          writeQuotedChar(c, writer);
          return this;
        }

        @Override
        public void close() throws IOException {
          writer.write('"');
        }
      };
    }

    @Override
    public Appendable startRawElement() throws IOException {
      addComma();
      return new Appendable() {
        @Override
        public Appendable append(final CharSequence csq) throws IOException {
          if (CharSequences.containsAnyChar(csq, toEscape)) {
            throw new IllegalStateException("Attempting to write str containing escapeable seq " + csq);
          }
          writer.append(csq);
          return this;
        }

        @Override
        public Appendable append(final CharSequence csq, final int start, final int end) throws IOException {
          if (CharSequences.containsAnyChar(csq, start, end, toEscape)) {
            throw new IllegalStateException("Attempting to write str containing escapeable seq " + csq);
          }
          writer.append(csq, start, end);
          return this;
        }

        @Override
        public Appendable append(final char c) throws IOException {
          if (Arrays.search(toEscape, c) >= 0) {
            throw new IllegalStateException("Attempting to write str containing escapeable seq " + c);
          }
          writer.append(c);
          return this;
        }
      };
    }
  }

}