From f44433b04cf079fac0300a33798d62a3754af7d3 Mon Sep 17 00:00:00 2001 From: Spencer T Brody Date: Thu, 30 Jun 2011 17:22:27 -0400 Subject: [PATCH] Part 1 of SERVER-1097: Make mongoimport adhere to CSV spec. --- jstests/tool/csvimport1.js | 40 +++++++ jstests/tool/data/csvimport1.csv | 8 ++ tools/import.cpp | 186 ++++++++++++++++++------------- 3 files changed, 157 insertions(+), 77 deletions(-) create mode 100644 jstests/tool/csvimport1.js create mode 100644 jstests/tool/data/csvimport1.csv diff --git a/jstests/tool/csvimport1.js b/jstests/tool/csvimport1.js new file mode 100644 index 00000000000..3bff1110cbe --- /dev/null +++ b/jstests/tool/csvimport1.js @@ -0,0 +1,40 @@ +// csvimport1.js + +t = new ToolTest( "csvimport1" ) + +c = t.startDB( "foo" ); + +base = [] +base.push({ a : 1, b : "this is some text.\nThis text spans multiple lines, and just for fun\ncontains a comma", "c" : "This has leading and trailing whitespace!" }) +base.push({a : 2, b : "When someone says something you \"put it in quotes\"", "c" : "I like embedded quotes/slashes\\backslashes" }) +base.push({a : 3, b : " This line contains the empty string and has leading and trailing whitespace inside the quotes! ", "c" : "" }) +base.push({a : 4, b : "", "c" : "How are empty entries handled?" }) +base.push({a : 5, b : "\"\"", c : "\"This string is in quotes and contains empty quotes (\"\")\""}) +base.push({ a : "a" , b : "b" , c : "c"}) + +assert.eq( 0 , c.count() , "setup" ); + +t.runTool( "import" , "--file" , "jstests/tool/data/csvimport1.csv" , "-d" , t.baseName , "-c" , "foo" , "--type" , "csv" , "-f" , "a,b,c" ); +assert.soon( base.length + " == c.count()" , "after import 1 " ); + +a = c.find().sort( { a : 1 } ).toArray(); +for (i = 0; i < base.length; i++ ) { + delete a[i]._id + assert.eq( tojson(base[i]), tojson(a[i]), "csv parse " + i) +} + +c.drop() +assert.eq( 0 , c.count() , "after drop" ) + +t.runTool( "import" , "--file" , "jstests/tool/data/csvimport1.csv" , "-d" , t.baseName , "-c" , "foo" , "--type" , "csv" , "--headerline" ) +assert.soon( "c.findOne()" , "no data after sleep" ); +assert.eq( base.length - 1 , c.count() , "after import 2" ); + +x = c.find().sort( { a : 1 } ).toArray(); +for (i = 0; i < base.length - 1; i++ ) { + delete x[i]._id + assert.eq( tojson(base[i]), tojson(x[i]), "csv parse with headerline " + i) +} + + +t.stop() diff --git a/jstests/tool/data/csvimport1.csv b/jstests/tool/data/csvimport1.csv new file mode 100644 index 00000000000..256d40a9184 --- /dev/null +++ b/jstests/tool/data/csvimport1.csv @@ -0,0 +1,8 @@ +a,b,c +1,"this is some text. +This text spans multiple lines, and just for fun +contains a comma", "This has leading and trailing whitespace!" +2, "When someone says something you ""put it in quotes""", I like embedded quotes/slashes\backslashes + 3 , " This line contains the empty string and has leading and trailing whitespace inside the quotes! ", "" + "4" ,, How are empty entries handled? +"5","""""", """This string is in quotes and contains empty quotes ("""")""" diff --git a/tools/import.cpp b/tools/import.cpp index f631ab46dc4..3410b8eef15 100644 --- a/tools/import.cpp +++ b/tools/import.cpp @@ -27,6 +27,7 @@ #include #include +#include using namespace mongo; @@ -46,21 +47,66 @@ class Import : public Tool { vector _upsertFields; static const int BUF_SIZE = 1024 * 1024 * 4; - string stripLeadingWhitespace(string str) { - int i = 0; - while (isspace(str[i])) { ++i; }; // Finds index of first non-whitespace character - return str.substr(i, str.size() - i); + string trimWhitespace(const string& str) { + int begin = 0; + int end = str.size() - 1; + while (isspace(str[begin])) { ++begin; } // Finds index of first non-whitespace character + while (isspace(str[end])) { --end; } // Finds index of last non-whitespace character + return str.substr(begin, end - begin + 1); + } + + void csvTokenizeRow(const string& row, vector& tokens) { + bool inQuotes = false; + bool prevWasQuote = false; + bool tokenQuoted = false; + string curtoken = ""; + for (string::const_iterator it = row.begin(); it != row.end(); ++it) { + char element = *it; + if (element == '"') { + if (!inQuotes) { + inQuotes = true; + tokenQuoted = true; + curtoken = ""; + } else { + if (prevWasQuote) { + curtoken += "\""; + prevWasQuote = false; + } else { + prevWasQuote = true; + } + } + } else { + if (inQuotes && prevWasQuote) { + inQuotes = false; + prevWasQuote = false; + tokens.push_back(curtoken); + } + + if (element == ',' && !inQuotes) { + if (!tokenQuoted) { // If token was quoted, it's already been added + tokens.push_back(trimWhitespace(curtoken)); + } + curtoken = ""; + tokenQuoted = false; + } else { + curtoken += element; + } + } + } + if (!tokenQuoted || (inQuotes && prevWasQuote)) { + tokens.push_back(trimWhitespace(curtoken)); + } } void _append( BSONObjBuilder& b , const string& fieldName , const string& data ) { - if ( b.appendAsNumber( fieldName , data ) ) - return; - if ( _ignoreBlanks && data.size() == 0 ) return; + if ( b.appendAsNumber( fieldName , data ) ) + return; + // TODO: other types? - b.append( fieldName , data ); + b.append ( fieldName , data ); } /* @@ -118,88 +164,75 @@ class Import : public Tool { * Returns a true if a BSONObj was successfully created and false if not. */ bool parseRow(istream* in, BSONObj& o, int& numBytesRead) { - boost::scoped_array line(new char[BUF_SIZE+2]); - char* buf = line.get(); + boost::scoped_array buffer(new char[BUF_SIZE+2]); + char* line = buffer.get(); - numBytesRead = getLine(in, buf); - buf += numBytesRead; + numBytesRead = getLine(in, line); + line += numBytesRead; - while ((_type != TSV || buf[0] != '\t') && isspace( buf[0] )) { - numBytesRead++; - buf++; - } - if (buf[0] == '\0') { + if (line[0] == '\0') { return false; } - numBytesRead += strlen( buf ); + numBytesRead += strlen( line ); if (_type == JSON) { // Strip out trailing whitespace - char * end = ( buf + strlen( buf ) ) - 1; + char * end = ( line + strlen( line ) ) - 1; while ( isspace(*end) ) { *end = 0; end--; } - o = fromjson( buf ); + o = fromjson( line ); return true; } - BSONObjBuilder b; vector tokens; + if (_type == CSV) { + string row; + bool inside_quotes = false; + size_t last_quote = 0; + while (true) { + string lineStr(line); + // Deal with line breaks in quoted strings + last_quote = lineStr.find_first_of('"'); + while (last_quote != string::npos) { + inside_quotes = !inside_quotes; + last_quote = lineStr.find_first_of('"', last_quote+1); + } - unsigned int pos=0; - while ( buf[0] ) { - bool done = false; - string data; - char * end; - if ( _type == CSV && buf[0] == '"' ) { - buf++; //skip first '"' + row.append(lineStr); - while (true) { - end = strchr( buf , '"' ); - if (!end) { - data += buf; - done = true; - break; - } - else if (end[1] == '"') { - // two '"'s get appended as one - data.append(buf, end-buf+1); //include '"' - buf = end+2; //skip both '"'s - } - else if (end[-1] == '\\') { - // "\\\"" gets appended as '"' - data.append(buf, end-buf-1); //exclude '\\' - data.append("\""); - buf = end+1; //skip the '"' - } - else { - data.append(buf, end-buf); - buf = end+2; //skip '"' and ',' - break; - } + if (inside_quotes) { + row.append("\n"); + int num = getLine(in, line); + line += num; + numBytesRead += num; + + uassert (15854, "CSV file ends while inside quoted field", line[0] != '\0'); + numBytesRead += strlen( line ); + } else { + break; } } - else { - end = strstr( buf , _sep ); - if ( ! end ) { - done = true; - data = string( buf ); - } - else { - data = string( buf , end - buf ); - buf = end+1; - } + // now 'row' is string corresponding to one row of the CSV file + // (which may span multiple lines) and represents one BSONObj + csvTokenizeRow(row, tokens); + } + else { // _type == TSV + while (line[0] != '\t' && isspace(line[0])) { // Strip leading whitespace, but not tabs + line++; } - tokens.push_back(data); - if ( done ) - break; + boost::split(tokens, line, boost::is_any_of(_sep)); } - for (vector::iterator token = tokens.begin(); token != tokens.end(); ++token) { + // Now that the row is tokenized, create a BSONObj out of it. + BSONObjBuilder b; + unsigned int pos=0; + for (vector::iterator it = tokens.begin(); it != tokens.end(); ++it) { + string token = *it; if ( _headerLine ) { - _fields.push_back(stripLeadingWhitespace(*token)); + _fields.push_back(token); } else { string name; @@ -213,10 +246,9 @@ class Import : public Tool { } pos++; - _append( b , name , *token ); + _append( b , name , token ); } } - o = b.obj(); return true; } @@ -342,26 +374,26 @@ public: int num = 0; int errors = 0; int len = 0; - // buf and line are only used when parsing a jsonArray - boost::scoped_array line(new char[BUF_SIZE+2]); - char* buf = line.get(); + // buffer and line are only used when parsing a jsonArray + boost::scoped_array buffer(new char[BUF_SIZE+2]); + char* line = buffer.get(); while ( _jsonArray || in->rdstate() == 0 ) { try { BSONObj o; if (_jsonArray) { int bytesProcessed = 0; - if (buf == line.get()) { // Only read on first pass - the whole array must be on one line. - bytesProcessed = getLine(in, buf); - buf += bytesProcessed; + if (line == buffer.get()) { // Only read on first pass - the whole array must be on one line. + bytesProcessed = getLine(in, line); + line += bytesProcessed; len += bytesProcessed; } - if ((bytesProcessed = parseJSONArray(buf, o)) < 0) { + if ((bytesProcessed = parseJSONArray(line, o)) < 0) { len += bytesProcessed; break; } len += bytesProcessed; - buf += len; + line += len; } else { if (!parseRow(in, o, len)) { @@ -398,7 +430,7 @@ public: } catch ( std::exception& e ) { cout << "exception:" << e.what() << endl; - cout << buf << endl; + cout << line << endl; errors++; if (hasParam("stopOnError") || _jsonArray)