#include #include #include #include #include #include #include #include #include #include #include constexpr char startMark[] = "****RAWSTRINGS****\n"; constexpr char endMark[] = "****END RAWSTRINGS****\n"; std::vector getStrings(char* fname) { constexpr int bufsize = std::max(sizeof(startMark), sizeof(endMark)) + 1; char buf[bufsize]; FILE *f = fopen(fname, "rb"); if (f == NULL) { perror("Couldn't open string file"); exit(-1); } bool gotStart = false; off_t startMarkOff; off_t endMarkOff; char *line = fgets(buf, bufsize, f); while (line) { if (!gotStart) { if (!strcmp(startMark, buf)) { startMarkOff = endMarkOff = ftello(f); gotStart = true; } } else { if (!strcmp(endMark, buf)) { break; } endMarkOff = ftello(f); } line = fgets(buf, bufsize, f); } if (!gotStart) { fprintf(stderr, "No start marker %s in file %s\n", startMark, fname); exit(-1); } if (!line) { fprintf(stderr, "No end marker %s in file %s\n", endMark, fname); exit(-1); } int strsize = endMarkOff - startMarkOff - 1; // -1 skips the last newline char *strbuf = (char *)malloc(strsize); fseeko(f, startMarkOff, SEEK_SET); fread(strbuf, 1, strsize, f); fclose(f); char *s = strbuf; char *endbuf = strbuf + strsize; std::vector result; while (s < endbuf) { size_t len = strlen(s); result.emplace_back(s, len); s += len + 1; } free(strbuf); return result; } typedef seqan::String String; typedef seqan::StringSet StringSet; typedef seqan::Index> TMIndex; //typedef seqan::Index TMIndex; // Seems to be same as Esa //typedef seqan::Index> TMIndex; // no workee //typedef seqan::Index> TMIndex; // slower than WOTD //typedef seqan::Index>> TMIndex; // slower than plain WOTD typedef seqan::Iterator>>::Type DFSIter; int ZStringCost(String str) { int cost = 0; for (auto ch : str) { if ((ch >= 'a' && ch <= 'z') || ch == ' ') { // Alphabet A0 and space. cost++; } else if (ch >= 'A' && ch <= 'Z') { // Alphabet A1 cost += 2; // SHIFT + char. } else if ((ch >= '0' && ch <= '9') || (strchr("\n.,!?_#'\"/\\-:()", ch) != NULL)) { // Alphabet A2 cost += 2; // SHIFT + char. } else { if (ch == '\r') { fprintf(stderr, "Carriage return in text\n"); exit(-1); } cost += 3; // ZSCII escape. } } return cost; } void printAbbr(const String& abbr, int strno, int count, int savings) { printf(" .FSTR FSTR?%d,\"", strno); for (auto ch : abbr) { putc(ch, stdout); if (ch == '"') putc(ch, stdout); } printf("\"\t\t; %dx, saved %d\n", count, savings); } void printWords(int nabbr) { printf("WORDS::\n"); for (int i = 0; i < nabbr; i++) { printf(" FSTR?%d\n", i+1); } printf(" .ENDI\n"); } int main(int argc, char *argv[]) { if (argc < 2) { fprintf(stderr, "Usage: %s \n", argv[0]); exit(-1); } char* fname = argv[1]; auto strings = getStrings(fname); #if 0 int i = 0; for (const auto& s:strings) { printf("%d |%.20s|\n", i++, s.c_str()); } #endif #if 0 for (const auto &s : mySet) { std::cerr << s << std::endl; } #endif printf(" ; Frequent words file for %s\n\n", fname); struct timeval traverseTime; struct timeval occurrenceTime; struct timeval buildSetTime; struct timeval costTime; // part of traverse struct timeval timeMark, timeMark2, delta; timerclear(&traverseTime); timerclear(&occurrenceTime); timerclear(&buildSetTime); timerclear(&costTime); gettimeofday(&timeMark, NULL); StringSet mySet; for (const auto& s:strings) { appendValue(mySet, s); } gettimeofday(&timeMark2, NULL); timersub(&timeMark2, &timeMark, &delta); timeradd(&buildSetTime, &delta, &buildSetTime); for (int abbrno = 0; abbrno < 96; abbrno++) { gettimeofday(&timeMark, NULL); TMIndex myIndex(mySet); DFSIter myIterator(myIndex); DFSIter winIterator(myIndex); int maxSavings = 0; // Do a DFS over the suffix tree, finding the node which produces the best savings. // Savings is calculated as number of Z-characters in the representation, minus 2 // (for the abbreviation reference), times number of occurrences, minus the number // of Z-characters in the rep (for the entry in the abbreviation table ). // // During the traversal, instead of figuring real cost, we figure min and max cost, based // on each character taking from 1 to 3 Z-characters. We make a list of candidates // which _could_ be the best, then we rank them afterwards. This is empirically // about 20% faster than keeping track of real cost during traversal, because // getting the parent edge seems expensive. // The int is the max savings for this candidate. std::vector> candidates; int minMaxSavings = 0; while (!atEnd(myIterator)) { auto count = seqan::countOccurrences(myIterator); int cost = 0; int replength = seqan::repLength(myIterator); if (count > 1) { // formulas are equivalent to ZAPFs // (count -1) * (COST - 2) - 2 int minsavings = (int)count * (replength - 2) -replength; int maxsavings = (int)count * (replength*3 - 2) -replength*3; if (minsavings > minMaxSavings) { // Remove any candidates whose max savings is less than the // min savings of the current candidate. auto toremove = std::partition(candidates.begin(), candidates.end(), [minMaxSavings](const auto& a) { return a.first >= minMaxSavings; }); candidates.erase(toremove, candidates.end()); minMaxSavings = minsavings; } if (maxsavings >= minMaxSavings) { candidates.emplace_back(maxsavings, myIterator); } } ++myIterator; } gettimeofday(&timeMark2, NULL); timersub(&timeMark2, &timeMark, &delta); timeradd(&traverseTime, &delta, &traverseTime); #if 0 std::cerr << "Number of winner candidates at iter " << abbrno << " : " << candidates.size() << std::endl; #endif gettimeofday(&timeMark, NULL); maxSavings = 0; for (const auto& candidate : candidates) { // If our approximate savings is not greater than our current best value, no // point in doing the expensive calculation. if (candidate.first > maxSavings) { const auto& iter = candidate.second; int count = seqan::countOccurrences(iter); int cost = ZStringCost(seqan::representative(iter)); int savings = count * (cost - 2) -cost; if (savings > maxSavings) { winIterator = iter; maxSavings = savings; } } } myIterator = winIterator; gettimeofday(&timeMark2, NULL); timersub(&timeMark2, &timeMark, &delta); timeradd(&costTime, &delta, &costTime); auto rep = seqan::representative(myIterator); auto replength = seqan::repLength(myIterator); auto count = seqan::countOccurrences(myIterator); printAbbr(rep, abbrno+1, count, maxSavings); gettimeofday(&timeMark, NULL); // std::cerr << maxSavings << " : " << count << " : " << rep << std::endl; auto occ= seqan::getOccurrences(myIterator); // seqan::orderOccurrences(occ); // orderOccurrences doesn't work with StringSet pair occurrences, nor does sort. std::vector> stdocc; for (const auto &oc : occ) { stdocc.emplace_back(seqan::getSeqNo(oc), seqan::getSeqOffset(oc)); } std::sort(stdocc.begin(), stdocc.end()); #if 0 for (const auto &oc: stdocc) { std::cerr << oc.first << " " << oc.second << std::endl; } #endif gettimeofday(&timeMark2, NULL); timersub(&timeMark2, &timeMark, &delta); timeradd(&occurrenceTime, &delta, &occurrenceTime); timeMark = timeMark2; auto occiter = stdocc.begin(); StringSet newSet; seqan::Iterator::Type olditer = seqan::begin(mySet); for (int i = 0; olditer != seqan::end(mySet); i++, olditer++) { if (occiter == stdocc.end() || occiter->first > i) { seqan::appendValue(newSet, *olditer); } else { if (occiter->first < i) { fprintf(stderr, "Oops missed an occiter\n"); exit(-1); } int prevpos = 0; while (occiter != stdocc.end() && occiter->first == i) { int nextpos = occiter->second; ++occiter; if (prevpos < nextpos) { seqan::appendValue(newSet, seqan::infix(*olditer, prevpos, nextpos)); } prevpos = nextpos + replength; } // insert last string. auto laststr = seqan::suffix(*olditer, prevpos); if (!seqan::empty(laststr)) { seqan::appendValue(newSet, laststr); } } } std::swap(mySet, newSet); gettimeofday(&timeMark2, NULL); timersub(&timeMark2, &timeMark, &delta); timeradd(&buildSetTime, &delta, &buildSetTime); } printWords(96); #define PRINTTIME(x) fprintf(stderr, #x ": %f\n", (double)x.tv_sec + (double)x.tv_usec/1000000.0) PRINTTIME(traverseTime); PRINTTIME(costTime); PRINTTIME(occurrenceTime); PRINTTIME(buildSetTime); }