001 /**
002 * ========================================
003 * JFreeReport : a free Java report library
004 * ========================================
005 *
006 * Project Info: http://reporting.pentaho.org/
007 *
008 * (C) Copyright 2000-2007, by Object Refinery Limited, Pentaho Corporation and Contributors.
009 *
010 * This library is free software; you can redistribute it and/or modify it under the terms
011 * of the GNU Lesser General Public License as published by the Free Software Foundation;
012 * either version 2.1 of the License, or (at your option) any later version.
013 *
014 * This library is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY;
015 * without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
016 * See the GNU Lesser General Public License for more details.
017 *
018 * You should have received a copy of the GNU Lesser General Public License along with this
019 * library; if not, write to the Free Software Foundation, Inc., 59 Temple Place, Suite 330,
020 * Boston, MA 02111-1307, USA.
021 *
022 * [Java is a trademark or registered trademark of Sun Microsystems, Inc.
023 * in the United States and other countries.]
024 *
025 * ------------
026 * $Id: CSVTokenizer.java,v 1.9 2007/04/01 18:49:33 taqua Exp $
027 * ------------
028 * (C) Copyright 2000-2005, by Object Refinery Limited.
029 * (C) Copyright 2005-2007, by Pentaho Corporation.
030 */
031 package org.jfree.report.util;
032
033 import java.util.Enumeration;
034 import java.util.NoSuchElementException;
035
/**
 * The csv tokenizer class allows an application to break a single record of a Comma
 * Separated Value format into tokens.
 * <p/>
 * Tokens are delimited by a separator string, which may be specified either at creation
 * time or on a per-token basis. The separator is matched as a whole string, not as a set
 * of characters, and is never returned as a token. Two adjacent separators, or a
 * separator at the very end of the record, yield an empty token.
 * <p/>
 * A token that starts with the quote string is read up to the matching closing quote
 * string and is returned without the surrounding quotes; separators inside the quotes
 * belong to the token. Two consecutive quote strings inside a quoted token stand for one
 * literal quote string. An empty quote string disables quote handling.
 * <p/>
 * Leading and trailing whitespace of the record is removed with
 * <code>String.trim()</code> before tokenizing. With a whitespace separator such as
 * {@link #SEPARATOR_TAB} this also removes empty leading or trailing fields.
 * <p/>
 * A <tt>CSVTokenizer</tt> object internally maintains a current position within the
 * record. The instance carries this mutable state and performs no synchronization.
 * <p/>
 * The following is one example of the use of the tokenizer. The code:
 * <blockquote><pre>
 *     CSVTokenizer csvt = new CSVTokenizer("this,is,a,test");
 *     while (csvt.hasMoreTokens()) {
 *         println(csvt.nextToken());
 *     }
 * </pre></blockquote>
 * <p/>
 * prints the following output:
 * <blockquote><pre>
 *     this
 *     is
 *     a
 *     test
 * </pre></blockquote>
 *
 * @author abupon
 */
public class CSVTokenizer implements Enumeration
{
  /**
   * The complete (trimmed) record that should be separated into elements.
   */
  private final String record;
  /**
   * The separator; never null and never empty.
   */
  private String separator;
  /**
   * The quoting string; never null, an empty string disables quoting.
   */
  private String quate;

  /**
   * The current parsing position. Before the first token it is the start of the record,
   * afterwards it is the position directly behind the previously returned token.
   */
  private int currentIndex;

  /**
   * The number of characters that must be skipped at <code>currentIndex</code> before
   * the next token starts. This is zero before the first token and afterwards the length
   * of the separator that was in effect when the previous token was read, so that a
   * later change of the separator does not skip the wrong number of characters.
   */
  private int skipLength;

  /**
   * A possible separator constant.
   */
  public static final String SEPARATOR_COMMA = ",";
  /**
   * A possible separator constant.
   */
  public static final String SEPARATOR_TAB = "\t";
  /**
   * A possible separator constant.
   */
  public static final String SEPARATOR_SPACE = " ";

  /**
   * A possible quote character constant.
   */
  public static final String DOUBLE_QUATE = "\"";
  /**
   * A possible quote character constant.
   */
  public static final String SINGLE_QUATE = "'";

  /**
   * Constructs a csv tokenizer for the specified string. The separator string only
   * serves to separate tokens and is never returned as a token itself.
   *
   * @param aString      a string to be parsed; it is trimmed before parsing.
   * @param theSeparator the separator (CSVTokenizer.SEPARATOR_COMMA,
   *                     CSVTokenizer.SEPARATOR_TAB, CSVTokenizer.SEPARATOR_SPACE, etc.).
   * @param theQuate     the quate (CSVTokenizer.SINGLE_QUATE, CSVTokenizer.DOUBLE_QUATE,
   *                     etc.); an empty string disables quote handling.
   * @throws NullPointerException     if any of the arguments is null.
   * @throws IllegalArgumentException if the separator is empty.
   */
  public CSVTokenizer (final String aString, final String theSeparator,
                       final String theQuate)
  {
    if (aString == null)
    {
      throw new NullPointerException("The given string is null");
    }
    if (theSeparator == null)
    {
      throw new NullPointerException("The given separator is null");
    }
    if (theQuate == null)
    {
      throw new NullPointerException("The given quate is null");
    }
    checkSeparator(theSeparator);
    this.record = aString.trim();
    this.separator = theSeparator;
    this.quate = theQuate;
    this.currentIndex = 0;
    this.skipLength = 0;
  }

  /**
   * Constructs a csv tokenizer for the specified string that uses
   * <code>CSVTokenizer.DOUBLE_QUATE</code> as quote string. The separator string itself
   * will not be treated as token.
   *
   * @param aString      a string to be parsed.
   * @param theSeparator the separator (CSVTokenizer.SEPARATOR_COMMA,
   *                     CSVTokenizer.SEPARATOR_TAB, CSVTokenizer.SEPARATOR_SPACE, etc.).
   * @throws NullPointerException     if any of the arguments is null.
   * @throws IllegalArgumentException if the separator is empty.
   */
  public CSVTokenizer (final String aString, final String theSeparator)
  {
    this(aString, theSeparator, CSVTokenizer.DOUBLE_QUATE);
  }

  /**
   * Constructs a csv tokenizer for the specified string. The tokenizer uses the default
   * separator, which is <code>CSVTokenizer.SEPARATOR_COMMA</code>, and the default quote
   * string, which is <code>CSVTokenizer.DOUBLE_QUATE</code>.
   *
   * @param aString a string to be parsed.
   * @throws NullPointerException if the string is null.
   */
  public CSVTokenizer (final String aString)
  {
    this(aString, CSVTokenizer.SEPARATOR_COMMA);
  }

  /**
   * Rejects separators that cannot delimit anything. An empty separator matches at every
   * position without consuming input, so tokenizing would never terminate.
   *
   * @param candidate the separator to check.
   * @throws NullPointerException     if the separator is null.
   * @throws IllegalArgumentException if the separator is empty.
   */
  private static void checkSeparator (final String candidate)
  {
    if (candidate == null)
    {
      throw new NullPointerException("The given separator is null");
    }
    if (candidate.length() == 0)
    {
      throw new IllegalArgumentException("The given separator is empty");
    }
  }

  /**
   * Tests if there are more tokens available from this tokenizer's string. If this method
   * returns <tt>true</tt>, then a subsequent call to <tt>nextToken</tt> with no argument
   * will try to read a token. A record that ends with a separator reports one final,
   * empty token.
   *
   * @return <code>true</code> if the current position is before the end of the record;
   *         <code>false</code> otherwise.
   */
  public boolean hasMoreTokens ()
  {
    return (this.currentIndex < this.record.length());
  }

  /**
   * Returns the next token from this tokenizer. Quoted tokens are returned without the
   * surrounding quotes and with doubled quotes collapsed into single ones.
   * <p/>
   * If this method throws an exception, the current position is left unchanged.
   *
   * @return the next token from this tokenizer.
   *
   * @throws NoSuchElementException   if there are no more tokens in this tokenizer's
   *                                  string.
   * @throws IllegalArgumentException if the record is malformed, for instance if a
   *                                  quoted token is not terminated.
   */
  public String nextToken ()
          throws NoSuchElementException, IllegalArgumentException
  {
    if (!this.hasMoreTokens())
    {
      throw new NoSuchElementException();
    }

    // Every token but the first one is preceded by a separator that is still unread.
    final int start = this.currentIndex + this.skipLength;
    if (start > this.record.length())
    {
      // Only possible if stray characters follow a closing quote and the separator is
      // longer than what is left of the record.
      throw new IllegalArgumentException("Illegal format");
    }

    final String token;
    final int next;
    if (this.quate.length() > 0 && this.record.startsWith(this.quate, start))
    {
      final StringBuffer unquoted = new StringBuffer();
      next = readQuoted(start, unquoted);
      token = unquoted.toString();
    }
    else
    {
      final int end = this.record.indexOf(this.separator, start);
      next = (end >= 0) ? end : this.record.length();
      token = this.record.substring(start, next);
    }

    // Commit the new state only after the token has been read successfully.
    this.currentIndex = next;
    this.skipLength = this.separator.length();
    return token;
  }

  /**
   * Reads a quoted token that starts with the opening quote at the given position.
   *
   * @param start  the position of the opening quote within the record.
   * @param target receives the token text without the surrounding quotes.
   * @return the position directly behind the closing quote.
   *
   * @throws IllegalArgumentException if the closing quote is missing.
   */
  private int readQuoted (final int start, final StringBuffer target)
  {
    final int quoteLength = this.quate.length();
    int position = start + quoteLength;
    for (; ;)
    {
      final int end = this.record.indexOf(this.quate, position);
      if (end < 0)
      {
        throw new IllegalArgumentException("Illegal format");
      }
      target.append(this.record.substring(position, end));
      position = end + quoteLength;
      if (!this.record.startsWith(this.quate, position))
      {
        // A single quote string closes the token.
        return position;
      }
      // A doubled quote string is an escaped quote that belongs to the token.
      target.append(this.quate);
      position += quoteLength;
    }
  }

  /**
   * Returns the next token in this tokenizer's string. First, the separator used by this
   * <tt>CSVTokenizer</tt> object is changed to the given string. Then the next token in
   * the string after the current position is returned. The current position is advanced
   * beyond the recognized token. The new separator remains the default after this call,
   * even if reading the token fails.
   *
   * @param theSeparator the new separator.
   * @return the next token, after switching to the new separator.
   *
   * @throws NullPointerException     if the separator is null.
   * @throws IllegalArgumentException if the separator is empty or the record is
   *                                  malformed.
   * @throws java.util.NoSuchElementException
   *                                  if there are no more tokens in this tokenizer's
   *                                  string.
   */
  public String nextToken (final String theSeparator)
  {
    checkSeparator(theSeparator);
    this.separator = theSeparator;
    return nextToken();
  }

  /**
   * Returns the same value as the <code>hasMoreTokens</code> method. It exists so that
   * this class can implement the <code>Enumeration</code> interface.
   *
   * @return <code>true</code> if there are more tokens; <code>false</code> otherwise.
   *
   * @see java.util.Enumeration
   * @see org.jfree.report.util.CSVTokenizer#hasMoreTokens()
   */
  public boolean hasMoreElements ()
  {
    return hasMoreTokens();
  }

  /**
   * Returns the same value as the <code>nextToken</code> method, except that its declared
   * return value is <code>Object</code> rather than <code>String</code>. It exists so
   * that this class can implement the <code>Enumeration</code> interface.
   *
   * @return the next token in the string.
   *
   * @throws java.util.NoSuchElementException
   *          if there are no more tokens in this tokenizer's string.
   * @see java.util.Enumeration
   * @see org.jfree.report.util.CSVTokenizer#nextToken()
   */
  public Object nextElement ()
  {
    return nextToken();
  }

  /**
   * Calculates the number of tokens that remain in the record when the current separator
   * and quote are used. The current position is not advanced, not even if the remaining
   * record turns out to be malformed.
   *
   * @return the number of tokens remaining in the string using the current separator.
   *
   * @throws IllegalArgumentException if the remaining record is malformed.
   * @see org.jfree.report.util.CSVTokenizer#nextToken()
   */
  public int countTokens ()
  {
    final int preservedIndex = this.currentIndex;
    final int preservedSkip = this.skipLength;
    try
    {
      int count = 0;
      while (this.hasMoreTokens())
      {
        this.nextToken();
        count++;
      }
      return count;
    }
    finally
    {
      // Restore the position on every exit path, including a format error.
      this.currentIndex = preservedIndex;
      this.skipLength = preservedSkip;
    }
  }

  /**
   * Returns the quate.
   *
   * @return the quote string; an empty string means that quoting is disabled.
   */
  public String getQuate ()
  {
    return this.quate;
  }

  /**
   * Sets the quate that is used for all tokens read after this call.
   *
   * @param quate The quate to set; an empty string disables quote handling.
   * @throws NullPointerException if the given quate is null.
   */
  public void setQuate (final String quate)
  {
    if (quate == null)
    {
      throw new NullPointerException("The given quate is null");
    }
    this.quate = quate;
  }
}