#include #include #include "csv_parse.h" #include // std::max #include using namespace std; typedef vector row; typedef map MSI; typedef map MSS; typedef map MSX; typedef map MSR; typedef map MSSX; // This class keeps track of everything we // know about a particular input file. class xfile{ public: string fname; string shortname; MSR content; int maxwid; int keycol; row header; xfile() : maxwid(0), keycol(-1) {} }; string S(xtring const xxx){ if (xxx.quoted) return "\"" + xxx.str + "\""; else return xxx.str; } int main(int argc, char * const * argv){ string sep("\t"); sep = ","; int filec = argc-1; char * const * filev = argv+1; vector todo; // one entry per file MSS gcfn; // list of globally consistent field names // Specify the list of fields that are globally consistent: // This really ought to be more easily configurable. gcfn["nick"] = ""; MSI allfn; // all field names seen, # of files in which seen // The abscissas of keylist tell you all the keys that have been seen. // The ordinate is a map, indexed by global field name, of the // value for that field (in the records associated with this key). MSSX keylist; for (int ii = 0; ii < filec; ii++){ todo.push_back(xfile()); vector::iterator foo = todo.begin() + ii; MSI unique_keys; string fname = filev[ii]; foo->fname = foo->shortname = fname; string ext = ".csv"; int where = fname.length()-ext.length(); if (fname.substr(where) == ext) { foo->shortname = fname.substr(0,where); } ifstream in(foo->fname); if (in.fail()){ cerr << "Could not open input file '" << foo->fname << "'" << endl; return 1; } row myrow; string line; for (int jj = 0;; jj++) { getline(in, line); if (!in.good()) break; csvline_populate(myrow, line, ','); if (jj == 0) { // assume headers are in row #0 foo->header = myrow; for (unsigned int col = 0; col < myrow.size(); col++) { string field = myrow[col].str; if (field == "key") { foo->keycol = col; } allfn[field]++; } } else { if (foo->keycol < 0) { cerr << "?? No 'key' column in file " << foo->fname << endl; foo->keycol = 0; } string key = myrow[foo->keycol].str; if (unique_keys.count(key)) { cerr << "?? Non-unique key '" << key << "'" << " file " << foo->fname // Give user a line number starting at 1 (not 0) // even though we count from 0 internally: << " line " << 1+jj << " (dropped)" << endl; } else { // valid unique key unique_keys[key] = 1; foo->content[key] = myrow; foo->maxwid = max(foo->maxwid, int(myrow.size())); for (unsigned int col = 0; col < myrow.size(); col++) { string field = foo->header[col].str; if (gcfn.count(field)) { if (keylist[key].count(field)) { if (keylist[key][field].str != myrow[col].str){ cerr << "Inconsistent global field:"; cerr << "'" << keylist[key][field].str << "'" << " versus " << "'" << myrow[col].str << "'" << " file " << foo->fname // Give user a line number starting at 1 (not 0) // even though we count from 0 internally: << " line " << 1+jj << endl; } } else { keylist[key][field].str = myrow[col].str; keylist[key][field].quoted |= myrow[col].quoted; } } } } } } in.close(); } // Print the header line // The key: cout << "key" << sep; // The globally consistent fields: for (MSS::const_iterator field = gcfn.begin(); field != gcfn.end(); field++) { cout << field->first << sep; } // The other fields, then ones that come from a particular file: for (unsigned int ii = 0; ii < todo.size(); ii++) { vector::iterator foo = todo.begin() + ii; row myrow = foo->header; for (int col = 0; col < foo->maxwid; col++) { // Skip the key field; it has already been taken care of: if (col == foo->keycol) continue; // Skip any globally consistent field: string field = foo->header[col].str; if (gcfn.count(field)) continue; // Finally, the big print: string val("#null"); if (col < int(myrow.size())) val = S(myrow[col]); if (allfn[field] > 1) { // Field name is not unique, so say what file it came from: cout << foo->shortname << "." << val << sep; } else { // Field name is unique; it suffices all by itself: cout << val << sep; } } } // This is the end of the header line: cout << endl; // print the body of the table for (MSSX::const_iterator kk = keylist.begin(); kk != keylist.end(); kk++) { // Print the key: string key = kk->first; cout << key << sep; // Print the globally consistent fields: for (MSX::const_iterator field = kk->second.begin(); field != kk->second.end(); field++) { cout << S(field->second) << sep; } // Print the other fields, the ones that appear in a particular file: for (unsigned int ii = 0; ii < todo.size(); ii++) { vector::iterator foo = todo.begin() + ii; row myrow = foo->content[key]; for (int col = 0; col < foo->maxwid; col++) { // Skip the key field; it has already been taken care of: if (col == foo->keycol) continue; // Skip any globally consisten field: if (gcfn.count(foo->header[col].str)) continue; // Finally, the big print: string val("#null"); if (col < int(myrow.size())) val = S(myrow[col]); cout << val << sep; } } // This ends the record in the output file: cout << endl; } }