//////////////////// // Code for Modeling the probabilities for a Categorical Process // A Bernoulli process has, by definition, only two possible outcomes. // The generalization to N possible outcomes is called a /categorical/ // process aka multinomial process. // // https://en.wikipedia.org/wiki/Categorical_distribution #include using namespace std; #include #include #include #include #include #include void usage() { cout << "Options include:\n" " -of foo.csv # output file name: ensemble of observations\n" " ## filed data usable in e.g. ./ternary-plot.gnumeric\n" " -sf svd.csv # SVD file name\n" " ## filed data usable in e.g. ./bernoulli-process.gnumeric\n" " -ss nnn # sample size [default 1000]\n" " -es nnn # ensemble size [default 10000]\n" " -seed iii # for initializing the RNG\n" "Example:\n" " ./multinomial .50 .50\n" "NPR example:\n" " ./multinomial .10 .10 .10 .10 .10 \\\n" " .07 .06 .05 .04 .03 .02 .01 .01 .01 0 x -ss 679\n" ;;;;;;;;; } // Note: Compile with: // multinomial : multinomial.o // $(CXX) $^ -larmadillo -o $@ string const numeric(".0123456789"); class sampler{ public: int seed; boost::mt19937 rng; boost::random::uniform_real_distribution distro; sampler() : seed(42), rng(seed), distro(0, 1) {} sampler(int const _seed) : seed(_seed), rng(seed), distro(0, 1) {} vector doit(int const sample_size, vector const partition); }; vector sampler::doit(int const sample_size, vector const partition){ vector score(partition.size()); for (int ii = 0; ii < sample_size; ii++) { double level = distro(rng); for (unsigned int ii = 0; ii < score.size(); ii++) { if (level <= partition[ii]) { score[ii]++; break; } } } vector rslt(score.size()); for (unsigned int ii = 0; ii < score.size(); ii++) { rslt[ii] = score[ii] / double(sample_size); } return rslt; } void punt(string const arg){ cerr << "Option '" << arg << "' requires an argument" << endl; exit(1); } // The expression "\"" is just too ugly. // Define a nicer-looking synonym: string const Q = "\""; // Same thing, plus a comma: string const Qc = "\","; #define X(str) (Q + #str + Qc) // FIXME: in texmode, in floating-point numerals, replace "e" with "\e" void dump_matrix(ostream& ouch, arma::mat const &mmm, int const pr, int const texmode=0){ int wd(pr+6); int dim(mmm.n_rows); for (int ii = 0; ii < dim; ii++){ for (int jj = 0; jj < dim; jj++){ if (texmode) { ///xxxxx if ( ((ii/5) % 2) || ((jj/5) % 2) ) if ( mmm(ii, jj) > 3 ) ouch << "\\y{"; else ouch << "\\x{"; ouch << setprecision(pr) << setw(wd) << mmm(ii, jj) << "} "; } else { ouch << setprecision(pr) << setw(wd) << mmm(ii, jj) << ", "; } } if (texmode) { ouch << " \\z" << endl; } else { ouch << endl; } } } void dump_columns(ostream& ouch, arma::mat const &mmm, int const pr, int const texmode=0){ if (!texmode) { dump_matrix(ouch, mmm, pr, texmode); } else { int wd(pr+6); int dim(mmm.n_rows); // first, choose a column: for (int jj = 0; jj < dim; jj++) { ouch << "\\x{\\left["; ouch << "\\begin{array}{c}%%\n"; // dump all rows in this column for (int ii = 0; ii < dim; ii++) { ouch << setprecision(pr) << setw(wd) << mmm(ii, jj) << " \\\\ "; } ouch << "\\end{array}\\right]}\n"; } } } void dump_svd(ostream& ouch, arma::mat const &covariance, arma::vec const &eival, arma::mat const &U, int const pr, int const texmode=0){ ouch << "*********** texmode: " << texmode << endl; int wd(pr+6); int dim(U.n_rows); // display the raw covariance: ouch << "Raw covariance:" << endl; dump_matrix(ouch, covariance, pr); // derive the raw error bars on the diagonal elements: ouch << "Raw stdev (cheapness):" << endl; for (int ii = 0; ii < dim; ii++){ ouch << setprecision(pr) << setw(wd) << sqrt(covariance(ii, ii)) << ", "; } ouch << endl; // display SVD-related stuff: ouch << "Eigenvalues:" << endl; for (int ii = 0; ii < dim; ii++){ if (!texmode) { ouch << setprecision(pr) << setw(wd) << eival(ii) << ", "; } else { ouch << "\\x{" << setprecision(pr) << setw(wd) << eival(ii) << "} "; } } ouch << endl; ouch << "Rotated error bars (cheapness):" << endl; for (int ii = 0; ii < dim; ii++){ ouch << setprecision(pr) << setw(wd) << sqrt(eival(ii)) << ", "; } ouch << endl; ouch << "Eigenvectors:" << endl; dump_columns(ouch, U, pr, texmode); ouch << "Oriented error bars:" << endl; arma::vec orbar(dim); // for (int ii = 0; ii < dim; ii++) orbar(ii) = sqrt(eival(ii)); orbar = sqrt(eival); dump_columns(ouch, U * diagmat(orbar), pr, texmode); } int main(int _argc, char ** _argv){ int texmode(0); string ofn(""); string sfn(""); int pr(4); // precision for printing numbers int rng_seed(42); int verbosity(0); if (verbosity) {} // suppress compiler warning int sample_size(1000); int ensemble_size(10000); // density is the probability density distribution; // partition reflects the cumulative probability distribution: vector density; vector partition; int argc(_argc); char** argv(_argv); string progname(*argv++); argc--; double subttl(0); while (argc) { string arg(*argv++); argc--; if (arg.substr(0,2) == "--") { arg = arg.substr(1); } if (0) {} else if (arg == "-h" || arg == "-help") { usage(); exit(0); } else if (arg == "-of") { if (!argc) punt(arg); ofn = *argv++; argc--; } else if (arg == "-sf") { if (!argc) punt(arg); sfn = *argv++; argc--; } else if (arg == "-ss") { if (!argc) punt(arg); sample_size = atof(*argv++); argc--; } else if (arg == "-es") { if (!argc) punt(arg); ensemble_size = atof(*argv++); argc--; } else if (arg == "-tex") { texmode++; } else if (arg == "-seed") { if (!argc) punt(arg); rng_seed = atoi(*argv++); argc--; } else if (arg == "x" || arg == "X") { density.push_back(1. - subttl); subttl += density.back(); partition.push_back(subttl); } else { // should be numeric if (numeric.find(arg[0]) == string::npos) { cerr << "Arg '" << arg << "' is not numeric, and not a recognized option" << endl; exit(1); } density.push_back(atof(arg.c_str())); subttl += density.back(); partition.push_back(subttl); } } int dim(density.size()); if (dim >= 16) pr = 2; // squeeze to fit on page int wd(pr+6); // open the files: ofstream ouch; if (ofn.length()){ ouch.open(ofn); if (!ouch.good()) { cerr << "Could not open output file '" << ofn << "'" << endl; exit(1); } } ofstream svdch; if (sfn.length()){ svdch.open(sfn); if (!svdch.good()) { cerr << "Could not open SVD file '" << sfn << "'" << endl; exit(1); } } arma::mat est_covariance(dim, dim, arma::fill::zeros); for (int ii = 0; ii < dim; ii++) { cout << setw(wd) << density[ii] << ", "; } cout << endl; for (int ii = 0; ii < dim; ii++) { cout << setw(wd) << partition[ii] << ", "; } cout << endl; // Send some metadata to the file: if (ofn.length()){ ouch << X(sample_size) << X(ensemble_size) << endl; ouch << sample_size << ", " << ensemble_size << ", " << endl; for (int ii = 0; ii < dim; ii++) { ouch << density[ii] << ", "; } ouch << endl; ouch << endl; // blank line ouch << X(Data.........) << endl; } // the big loop over all elements of the ensemble sampler sample(rng_seed); for (int qq = 0; qq < ensemble_size; qq++) { vector normed = sample.doit(sample_size, partition); double ttl(0); for (unsigned int ii = 0; ii < normed.size(); ii++){ ttl += normed[ii]; for (unsigned int jj = 0; jj < normed.size(); jj++){ est_covariance(ii, jj) += (normed[ii] - density[ii]) * (normed[jj] - density[jj]); } } if (ofn.length()) { for (unsigned int ii = 0; ii < normed.size(); ii++){ ouch << normed[ii] << ", "; } ouch << ttl << endl; } } // normalize the est_covariance, so as to represent an ensemble average // (rather than a running subtotal) for (int ii = 0; ii < dim; ii++){ for (int jj = 0; jj < dim; jj++){ est_covariance(ii, jj) = est_covariance(ii, jj) / ensemble_size; } } // construct exact a_priori covariance arma::mat ap_covariance(dim, dim, arma::fill::zeros); for (int ii = 0; ii < dim; ii++){ for (int jj = 0; jj < dim; jj++){ ap_covariance(ii, jj) = - density[ii]*density[jj] / sample_size; } ap_covariance(ii, ii) = density[ii]*(1.-density[ii]) / sample_size; } // now do the singular value decomposition: arma::mat est_U, est_V; arma::vec est_eival; arma::svd(est_U, est_eival, est_V, est_covariance); arma::mat ap_U, ap_V; arma::vec ap_eival; arma::svd(ap_U, ap_eival, ap_V, ap_covariance); // Send the results to the screen: cout << "a_priori..............." << endl; dump_svd(cout, ap_covariance, ap_eival, ap_U, pr, texmode); if (0) { cout << "estimated.............." << endl; dump_svd(cout, est_covariance, est_eival, est_U, pr, texmode); } // and possibly to a file: // File gets higher precision than screen: if (sfn.length()){ svdch << "a_priori..............." << endl; dump_svd(svdch, ap_covariance, ap_eival, ap_U, 15, texmode); svdch << "estimated.............." << endl; dump_svd(svdch, est_covariance, est_eival, est_U, 15, texmode); } // Calculate pairwise moves: arma::vec inv_eival(dim); for (int ii = 0; ii < dim; ii++){ if (abs(ap_eival(ii)) > 1e-10) { inv_eival(ii) = 1.0 / ap_eival(ii); } else { inv_eival(ii) = 0; } } arma::mat ap_Sigma(ap_U*diagmat(inv_eival)*ap_V.t()); // mahad is the Mahalanobis distance, // i.e. how far off-peak the variable is, // measured in units of sigma arma::mat mahad(dim, dim); for (int ii = 0; ii < dim; ii++){ for (int jj = 0; jj < dim; jj++){ if (ii == jj) { mahad(ii, jj) = -0; } else { arma::vec diff_move(dim, arma::fill::zeros); diff_move(ii) = 0.50 * abs(density[ii] - density[jj]); diff_move(jj) = -diff_move(ii); mahad(ii, jj) = sqrt(dot(diff_move, ap_Sigma * diff_move)); } } } cout << "Pairwise mahad:" << endl; dump_matrix(cout, mahad, pr, texmode); if (sfn.length()) { svdch << "Pairwise mahad:" << endl; dump_matrix(svdch, mahad, texmode ? pr : 15, texmode); svdch << "Pairwise mahad bubbles:" << endl; for (int ii = 0; ii < dim; ii++){ for (int jj = 0; jj < dim; jj++){ svdch << 1+ii << ", " << 1+jj << ", " << mahad(ii, jj) << ", " << endl; } } } // tidy up: if (sfn.length()) svdch.close(); if (ofn.length()) ouch.close(); }