View Javadoc
1   /*
2    * Copyright (c) 2001-2017, Zoltan Farkas All Rights Reserved.
3    *
4    * This library is free software; you can redistribute it and/or
5    * modify it under the terms of the GNU Lesser General Public
6    * License as published by the Free Software Foundation; either
7    * version 2.1 of the License, or (at your option) any later version.
8    *
9    * This library is distributed in the hope that it will be useful,
10   * but WITHOUT ANY WARRANTY; without even the implied warranty of
11   * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
12   * GNU General Public License for more details.
13   *
14   * You should have received a copy of the GNU Lesser General Public
15   * License along with this program; if not, write to the Free Software
16   * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.
17   *
18   * Additionally licensed with:
19   *
20   * Licensed under the Apache License, Version 2.0 (the "License");
21   * you may not use this file except in compliance with the License.
22   * You may obtain a copy of the License at
23   *
24   *      http://www.apache.org/licenses/LICENSE-2.0
25   *
26   * Unless required by applicable law or agreed to in writing, software
27   * distributed under the License is distributed on an "AS IS" BASIS,
28   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
29   * See the License for the specific language governing permissions and
30   * limitations under the License.
31   */
32  package org.spf4j.io.csv;
33  
34  import edu.umd.cs.findbugs.annotations.SuppressFBWarnings;
35  import gnu.trove.map.hash.THashMap;
36  import java.io.BufferedReader;
37  import java.io.File;
38  import java.io.IOException;
39  import java.io.InputStreamReader;
40  import java.io.Reader;
41  import java.io.UncheckedIOException;
42  import java.io.Writer;
43  import java.nio.charset.Charset;
44  import java.nio.file.Files;
45  import java.util.ArrayList;
46  import java.util.Iterator;
47  import java.util.List;
48  import java.util.Map;
49  import javax.annotation.CheckReturnValue;
50  import javax.annotation.ParametersAreNonnullByDefault;
51  import org.spf4j.base.Arrays;
52  import org.spf4j.base.CharSequences;
53  import org.spf4j.io.PushbackReader;
54  
55  /**
56   * Supports the Character Separated Values format described at
57   * https://en.wikipedia.org/wiki/Comma-separated_values, generalized to a custom separator character.
58   * Any of \n, \r or \r\n is a valid end of line delimiter.
59   *
60   * why another implementation? because I need one that is as fast as possible, and as flexible as possible.
61   *
62   * @author zoly
63   */
64  @ParametersAreNonnullByDefault
65  @SuppressFBWarnings("NP_LOAD_OF_KNOWN_NULL_VALUE") // FB gets it wrong here
66  public final class CharSeparatedValues {
67  
68    /**
69     * http://unicode.org/faq/utf_bom.html#BOM
70     */
71    public static final int UTF_BOM = '\uFEFF';
72  
73    private final char separator;
74    private final char[] toEscape;
75  
/**
 * Creates a format using the given separator. Elements containing the separator,
 * a line feed, a carriage return or a double quote will be written quoted.
 *
 * @param separator the element separator.
 * @throws IllegalArgumentException when the separator is a line feed, a carriage return or a double quote.
 */
public CharSeparatedValues(final char separator) {
  switch (separator) {
    case '\n':
    case '\r':
    case '"':
      throw new IllegalArgumentException("Illegal separator character " + separator);
    default:
      break;
  }
  this.toEscape = new char[]{separator, '\n', '\r', '"'};
  this.separator = separator;
}
83  
84    public CharSeparatedValues(final char separator, final char... extraCharsToEscape) {
85      if (separator == '\n' || separator == '\r' || separator == '"') {
86        throw new IllegalArgumentException("Illegal separator character " + separator);
87      }
88      this.separator = separator;
89      this.toEscape = new char[4 + extraCharsToEscape.length];
90      this.toEscape[0] = separator;
91      this.toEscape[1] = '\n';
92      this.toEscape[2] = '\r';
93      this.toEscape[3] = '"';
94      System.arraycopy(extraCharsToEscape, 0, this.toEscape, 4, extraCharsToEscape.length);
95    }
96  
97    public void writeCsvRow(final Appendable writer, final Object... elems) throws IOException {
98      writeCsvRowNoEOL(writer, elems);
99      writer.append('\n');
100   }
101 
102   @SafeVarargs
103   public final String toCsvRowString(final Object... elems) {
104     StringBuilder result = new StringBuilder(elems.length * 8);
105     try {
106       writeCsvRowNoEOL(result, elems);
107     } catch (IOException ex) {
108       throw new UncheckedIOException(ex);
109     }
110     return result.toString();
111   }
112 
113   public void writeCsvRowNoEOL(final Appendable writer, final Object... elems) throws IOException {
114     if (elems.length > 0) {
115       int i = 0;
116       Object elem = elems[i++];
117       if (elem != null) {
118         writeCsvElement(elem.toString(), writer);
119       }
120       while (i < elems.length) {
121         writer.append(separator);
122         elem = elems[i++];
123         if (elem != null) {
124           writeCsvElement(elem.toString(), writer);
125         }
126       }
127     }
128   }
129 
130   public void writeCsvRow2(final Appendable writer, final Object obj, final Object... elems)
131           throws IOException {
132     if (obj != null) {
133       writeCsvElement(obj.toString(), writer);
134     }
135     for (Object elem : elems) {
136       writer.append(separator);
137       if (elem != null) {
138         writeCsvElement(elem.toString(), writer);
139       }
140     }
141     writer.append('\n');
142   }
143 
144   public void writeCsvRow(final Appendable writer, final long... elems) throws IOException {
145     writeCsvRowNoEOL(elems, writer);
146     writer.append('\n');
147   }
148 
149   public void writeCsvRowNoEOL(final long[] elems, final Appendable writer) throws IOException {
150     if (elems.length > 0) {
151       int i = 0;
152       writer.append(Long.toString(elems[i++]));
153       while (i < elems.length) {
154         writer.append(separator);
155         writer.append(Long.toString(elems[i++]));
156       }
157     }
158   }
159 
160   public void writeCsvRow(final Appendable writer, final Iterable<?> elems) throws IOException {
161     writeCsvRowNoEOL(elems, writer);
162     writer.append('\n');
163   }
164 
165   public void writeCsvRowNoEOL(final Iterable<?> elems, final Appendable writer) throws IOException {
166     Iterator<?> it = elems.iterator();
167     if (it.hasNext()) {
168       Object next = it.next();
169       if (next != null) {
170         writeCsvElement(next.toString(), writer);
171       }
172       while (it.hasNext()) {
173         writer.append(separator);
174         next = it.next();
175         if (next != null) {
176           writeCsvElement(next.toString(), writer);
177         }
178       }
179     }
180   }
181 
182   public <T> T read(final File file, final Charset charset,
183           final CsvMapHandler<T> handler) throws IOException, CsvParseException {
184     try (BufferedReader br = new BufferedReader(new InputStreamReader(Files.newInputStream(file.toPath()), charset))) {
185       return read(br, handler);
186     }
187   }
188 
189   public <T> T read(final File file, final Charset charset,
190           final CsvHandler<T> handler) throws IOException, CsvParseException {
191     try (BufferedReader br = new BufferedReader(new InputStreamReader(Files.newInputStream(file.toPath()), charset))) {
192       return read(br, handler);
193     }
194   }
195 
196   public List<Map<String, String>> read(final Reader preader) throws IOException, CsvParseException {
197     return read(preader, new ToListMapHandler());
198   }
199 
200   public <T> T read(final Reader preader,
201           final CsvMapHandler<T> handler) throws IOException, CsvParseException {
202     return read(preader, new CsvMapHandler2CsvHandler<>(handler));
203   }
204 
205   public List<String> readRow(final Reader reader) throws IOException, CsvParseException {
206     return readRow(reader, new CsvRow2List());
207   }
208 
209   public <T> T readRow(final Reader reader, final CsvRowHandler<T> handler) throws IOException, CsvParseException {
210     return read(reader, new OneRowHandler<>(handler));
211   }
212 
213   public <T> T read(final Reader preader,
214           final CsvHandler<T> handler) throws IOException, CsvParseException {
215     PushbackReader reader = new PushbackReader(preader);
216     int firstChar = reader.read();
217     if (firstChar != UTF_BOM && firstChar >= 0) {
218       reader.unread(firstChar);
219     }
220     return readNoBom(reader, handler);
221   }
222 
223   /**
224    * reads CSV format until EOF of reader.
225    *
226    * @param <T>
227    * @param preader
228    * @param handler
229    * @return
230    * @throws IOException
231    */
232   public <T> T readNoBom(final PushbackReader reader, final CsvHandler<T> handler)
233           throws IOException, CsvParseException {
234     CsvReader r = reader(reader);
235     handler.startRow(0);
236     CsvReader.TokenType token = r.next();
237     while (token != CsvReader.TokenType.END_DOCUMENT) {
238       if (token == CsvReader.TokenType.ELEMENT) {
239         handler.element(r.getElement());
240         token = r.next();
241       } else if (token == CsvReader.TokenType.END_ROW) {
242         handler.endRow();
243         token = r.next();
244         if (token == CsvReader.TokenType.ELEMENT) {
245           handler.startRow(r.currentLineNumber());
246         }
247       }
248     }
249     return handler.eof();
250   }
251 
252   /**
253    * read a CSV stream, as a Iterable over rows.
254    * the List<String> instance is reused during iteration, you will need to copy content into
255    * own data structure.
256    * @param preader
257    * @return
258    */
259   public  Iterable<Iterable<String>> asIterable(final Reader preader) {
260     return () -> {
261       try {
262         return new CsvReader2Iterator(reader(preader));
263       } catch (IOException ex) {
264         throw new UncheckedIOException(ex);
265       }
266     };
267 
268   }
269 
270   /**
271    * Iterate through the first row of your CSV.
272    * the CharSequence is a re-0used char buffer you either need to parse the content out of copy it.
273    * @param preader
274    * @return
275    */
276   public Iterable<CharSequence> singleRow(final Reader preader) {
277     try {
278       CsvReader reader = reader(preader);
279       return () -> new OneRowIterator(reader);
280     } catch (IOException ex) {
281       throw new UncheckedIOException(ex);
282     }
283   }
284 
285   public CsvReader reader(final Reader preader) throws IOException {
286     PushbackReader reader = new PushbackReader(preader);
287     int firstChar = reader.read();
288     if (firstChar != UTF_BOM && firstChar >= 0) {
289       reader.unread(firstChar);
290     }
291     return readerNoBOMILEL(reader);
292   }
293 
294   /**
295    * will ignore last empty line.
296    * @param preader
297    * @return
298    * @throws IOException
299    * @deprecated use reader
300    */
301   @Deprecated
302   public CsvReader readerILEL(final Reader preader) throws IOException {
303     return reader(preader);
304   }
305 
306   /**
307    * assumes there is not BOM. (byte order marker)
308    * @param reader
309    * @return
310    */
311   public CsvReader readerNoBOM(final PushbackReader reader) {
312     return new CsvReaderImpl(reader);
313   }
314 
315   /**
316    * reader that there is not BOM. (byte order marker) and will ignore last empty line.
317    * @param reader
318    * @return
319    * @deprecated use readerNoBOM.
320    */
321   @Deprecated
322   public CsvReader readerNoBOMILEL(final PushbackReader reader) {
323     return new CsvReaderImpl(reader);
324   }
325 
326   public CsvWriter writer(final Writer writer) {
327     return new CsvWriterImpl(writer);
328   }
329 
330   public void writeCsvElement(final CharSequence elem, final Appendable writer) throws IOException {
331     if (CharSequences.containsAnyChar(elem, toEscape)) {
332       writeQuotedCsvElement(elem, writer);
333     } else {
334       writer.append(elem);
335     }
336   }
337 
338   public static void writeQuotedCsvElement(final CharSequence elem, final Appendable writer) throws IOException {
339     writer.append('"');
340     writeQuotedElementContent(elem, 0, elem.length(), writer);
341     writer.append('"');
342   }
343 
344   public static void writeQuotedElementContent(final CharSequence elem,
345           final int start, final int end, final Appendable writer) throws IOException {
346     for (int i = start; i < end; i++) {
347       char c = elem.charAt(i);
348       writeQuotedChar(c, writer);
349     }
350   }
351 
352   public static void writeQuotedChar(final char c, final Appendable writer) throws IOException {
353     if (c == '"') {
354       writer.append("\"\"");
355     } else {
356       writer.append(c);
357     }
358   }
359 
360    public CharSequence toCsvElement(final CharSequence elem) {
361     if (CharSequences.containsAnyChar(elem, toEscape)) {
362       StringBuilder sw = new StringBuilder(elem.length() + 4);
363       try {
364         writeQuotedCsvElement(elem, sw);
365       } catch (IOException ex) {
366         throw new UncheckedIOException(ex);
367       }
368       return sw;
369     } else {
370       return elem;
371     }
372   }
373 
374    public String toCsvElement(final String elem) {
375     if (CharSequences.containsAnyChar(elem, toEscape)) {
376       StringBuilder sw = new StringBuilder(elem.length() + 4);
377       try {
378         writeQuotedCsvElement(elem, sw);
379       } catch (IOException ex) {
380         throw new UncheckedIOException(ex);
381       }
382       return sw.toString();
383     } else {
384       return elem;
385     }
386   }
387 
388   /**
389    * returns next character.
390    *
391    * @param reader
392    * @param addElemTo
393    * @return - next character or -1 if eof has been reached.
394    * @throws IOException
395    */
396   @CheckReturnValue
397   public int readCsvElement(final Reader reader, final StringBuilder addElemTo, final long lineNr)
398           throws IOException, CsvParseException {
399     int c = reader.read();
400     if (c < 0) {
401       return c;
402     }
403     if (c == '"') {
404       c = reader.read();
405       while (c >= 0) {
406         if (c == '"') {
407           int c2 = reader.read();
408           if (c2 >= 0) {
409             if (c2 == '"') {
410               addElemTo.append((char) c);
411             } else {
412               return c2;
413             }
414           } else {
415             return c2;
416           }
417         } else {
418           addElemTo.append((char) c);
419         }
420         c = reader.read();
421       }
422       throw new CsvParseException("Escaped CSV element " + addElemTo + " not terminated correctly at " + lineNr);
423     } else {
424       while (c != separator && c != '\n' && c != '\r' && c >= 0) {
425         addElemTo.append((char) c);
426         c = reader.read();
427       }
428     }
429     return c;
430   }
431 
432   @Override
433   public String toString() {
434     return "CharSepValues{" + "separator=" + separator + '}';
435   }
436 
437   private static class ToListMapHandler implements CsvMapHandler<List<Map<String, String>>> {
438 
439     private List<Map<String, String>> result = new ArrayList<>();
440 
441     @Override
442     public void row(final Map<String, String> row) {
443       result.add(row);
444     }
445 
446     @Override
447     public List<Map<String, String>> eof() {
448       return result;
449     }
450   }
451 
452   private static class CsvMapHandler2CsvHandler<T> implements CsvHandler<T> {
453 
454     private final CsvMapHandler<T> handler;
455     private boolean first = true;
456     private final List<String> header = new ArrayList<>();
457     private int elemIdx;
458     private Map<String, String> row = null;
459     private long lineNr;
460 
461 
462     CsvMapHandler2CsvHandler(final CsvMapHandler<T> handler) {
463       this.handler = handler;
464     }
465 
466     @Override
467     public void startRow(final long ln) {
468       lineNr = ln;
469       elemIdx = 0;
470       if (!first) {
471         row = new THashMap<>(header.size());
472       }
473     }
474 
475     @Override
476     public void element(final CharSequence elem) throws CsvParseException {
477       if (first) {
478         header.add(elem.toString());
479       } else {
480         if (header.size() <= elemIdx) {
481           throw new CsvParseException("Too many elements in row " + row + " at line " + lineNr);
482         }
483         row.put(header.get(elemIdx), elem.toString());
484       }
485       elemIdx++;
486     }
487 
488     @Override
489     public void endRow() {
490       if (first) {
491         first = false;
492       } else {
493         handler.row(row);
494       }
495     }
496 
497     @Override
498     public T eof() {
499       return handler.eof();
500     }
501   }
502 
503  private class CsvReaderImpl implements CsvReader {
504 
505     private final PushbackReader reader;
506     private final StringBuilder currentElement = new StringBuilder();
507     private CsvReader.TokenType currentToken;
508     private CsvReader.TokenType nextToken;
509     private long lineNr = 0;
510 
511     CsvReaderImpl(final PushbackReader reader) {
512       this.reader = reader;
513       this.currentToken = CsvReader.TokenType.START_DOCUMENT;
514       this.nextToken = null;
515     }
516 
517    @SuppressFBWarnings("SF_SWITCH_FALLTHROUGH")
518    private void readNext() throws IOException, CsvParseException {
519      // nextToken will always be null;
520      switch (currentToken) {
521        case END_DOCUMENT:
522          nextToken = TokenType.END_DOCUMENT;
523          return;
524        case END_ROW:
525          // handle special case of EOF followed by EOL.
526          int peek = reader.read();
527          if (peek < 0) {
528            currentToken = TokenType.END_DOCUMENT;
529            nextToken = TokenType.END_DOCUMENT;
530            return;
531          }
532          reader.unread(peek);
533        case START_DOCUMENT:
534        case ELEMENT:
535          currentElement.setLength(0);
536          int next = readCsvElement(reader, currentElement, lineNr);
537          currentToken = CsvReader.TokenType.ELEMENT;
538          switch (next) {
539            case '\r':
540              lineNr++;
541              nextToken = CsvReader.TokenType.END_ROW;
542              int c2 = reader.read();
543              if (c2 < 0) {
544                return;
545              }
546              if (c2 != '\n') {
547                reader.unread(c2);
548              }
549              return;
550            case '\n':
551              lineNr++;
552              nextToken = CsvReader.TokenType.END_ROW;
553              c2 = reader.read();
554              if (c2 < 0) {
555                return;
556              }
557              if (c2 != '\r') {
558                reader.unread(c2);
559                break;
560              }
561              break;
562            default:
563              if (next != separator) {
564                if (next < 0) {
565                  nextToken = CsvReader.TokenType.END_ROW;
566                } else {
567                  throw new CsvParseException("Unexpected character " + next + " at line" + lineNr);
568                }
569              }
570          }
571          return;
572        default:
573          throw new IllegalStateException("Invalid current token " + currentToken);
574 
575      }
576 
577    }
578 
579     @Override
580     public CsvReader.TokenType next() throws IOException, CsvParseException {
581       if (nextToken == null) {
582         readNext();
583         return currentToken;
584       } else {
585         CsvReader.TokenType result = nextToken;
586         if (result != CsvReader.TokenType.END_DOCUMENT) {
587           nextToken = null;
588         }
589         currentToken = result;
590         return result;
591       }
592     }
593 
594     @Override
595     public CsvReader.TokenType current() {
596       return currentToken;
597     }
598 
599     @Override
600     public CharSequence getElement() {
601       if (currentToken != TokenType.ELEMENT) {
602         throw new IllegalStateException("No current element, current token is " + currentToken);
603       }
604       return currentElement;
605     }
606 
607     @Override
608     public long currentLineNumber() {
609       return lineNr;
610     }
611 
612   }
613 
614   private static class OneRowHandler<T> implements CsvHandler<T> {
615 
616     private final CsvRowHandler<T> handler;
617 
618 
619     OneRowHandler(final CsvRowHandler<T> handler) {
620       this.handler = handler;
621     }
622 
623     @Override
624     public void startRow(final long rowNr) {
625       if (rowNr > 0) {
626         throw new IllegalArgumentException("Multiple rows encountered for " + this);
627       }
628     }
629 
630     @Override
631     public void element(final CharSequence elem) {
632       handler.element(elem);
633     }
634 
635     @Override
636     public T eof() {
637       return handler.eof();
638     }
639 
640   }
641 
642   private static final class CsvRow2List implements CsvRowHandler<List<String>> {
643 
644     private final List<String> result = new ArrayList<>();
645 
646     @Override
647     public void element(final CharSequence elem) {
648       result.add(elem.toString());
649     }
650 
651     @Override
652     public List<String> eof() {
653       return result;
654     }
655   }
656 
657   private class CsvWriterImpl implements CsvWriter {
658 
659     private final Writer writer;
660 
661     CsvWriterImpl(final Writer writer) {
662       this.writer = writer;
663     }
664     private boolean isStartLine = true;
665 
666     @Override
667     public void writeElement(final CharSequence cs) throws IOException {
668       addComma();
669       writeCsvElement(cs, writer);
670     }
671 
672     private void addComma() throws IOException {
673       if (isStartLine) {
674         isStartLine = false;
675       } else {
676         writer.append(separator);
677       }
678     }
679 
680     @Override
681     public void writeEol() throws IOException {
682       writer.append('\n');
683       isStartLine = true;
684     }
685 
686     @Override
687     public void flush() throws IOException {
688       writer.flush();
689     }
690 
691 
692     @Override
693     public ElementAppendable startQuotedElement() throws IOException {
694       addComma();
695       writer.write('"');
696       return new ElementAppendable() {
697         @Override
698         public Appendable append(final CharSequence csq) throws IOException {
699           writeQuotedElementContent(csq, 0, csq.length(), writer);
700           return this;
701         }
702 
703         @Override
704         public Appendable append(final CharSequence csq, final int start, final int end) throws IOException {
705           writeQuotedElementContent(csq, start, end, writer);
706           return this;
707         }
708 
709         @Override
710         public Appendable append(final char c) throws IOException {
711           writeQuotedChar(c, writer);
712           return this;
713         }
714 
715         @Override
716         public void close() throws IOException {
717           writer.write('"');
718         }
719       };
720     }
721 
722     @Override
723     public Appendable startRawElement() throws IOException {
724       addComma();
725       return new Appendable() {
726         @Override
727         public Appendable append(final CharSequence csq) throws IOException {
728           if (CharSequences.containsAnyChar(csq, toEscape)) {
729             throw new IllegalStateException("Attempting to write str containing escapeable seq " + csq);
730           }
731           writer.append(csq);
732           return this;
733         }
734 
735         @Override
736         public Appendable append(final CharSequence csq, final int start, final int end) throws IOException {
737           if (CharSequences.containsAnyChar(csq, start, end, toEscape)) {
738             throw new IllegalStateException("Attempting to write str containing escapeable seq " + csq);
739           }
740           writer.append(csq, start, end);
741           return this;
742         }
743 
744         @Override
745         public Appendable append(final char c) throws IOException {
746           if (Arrays.search(toEscape, c) >= 0) {
747             throw new IllegalStateException("Attempting to write str containing escapeable seq " + c);
748           }
749           writer.append(c);
750           return this;
751         }
752       };
753     }
754   }
755 
756 }