------------------------------------------------------------
revno: 13428 [merge]
revision-id: kinkie@squid-cache.org-20140601162130-qtcw5mehtnzr1eaz
parent: squid3@treenet.co.nz-20140601123803-797lii07lhzsp5kz
parent: kinkie@squid-cache.org-20140601141737-7200b93in3i1g3jh
committer: Francesco Chemolli <kinkie@squid-cache.org>
branch nick: trunk
timestamp: Sun 2014-06-01 18:21:30 +0200
message:
  Merge: SBuf-based Tokenizer
------------------------------------------------------------
Use --include-merges or -n0 to see merged revisions.
------------------------------------------------------------
# Bazaar merge directive format 2 (Bazaar 0.90)
# revision_id: kinkie@squid-cache.org-20140601162130-qtcw5mehtnzr1eaz
# target_branch: http://bzr.squid-cache.org/bzr/squid3/trunk/
# testament_sha1: ecc95267b47ead1679da3480e385ff58b4a6284d
# timestamp: 2014-06-01 16:53:49 +0000
# source_branch: http://bzr.squid-cache.org/bzr/squid3/trunk/
# base_revision_id: squid3@treenet.co.nz-20140601123803-\
#   797lii07lhzsp5kz
#
# Begin patch
=== modified file 'configure.ac'
--- configure.ac	2014-05-11 19:12:35 +0000
+++ configure.ac	2014-05-27 08:23:05 +0000
@@ -3467,6 +3467,7 @@
 	src/ipc/Makefile
 	src/ssl/Makefile
 	src/mgr/Makefile
+	src/parser/Makefile
 	src/snmp/Makefile
 	contrib/Makefile
 	icons/Makefile

=== modified file 'src/Makefile.am'
--- src/Makefile.am	2014-04-30 10:50:09 +0000
+++ src/Makefile.am	2014-05-27 08:23:05 +0000
@@ -46,8 +46,8 @@
 	LoadableModules.h \
 	LoadableModules.cc
 
-SUBDIRS = base anyp comm eui acl format fs repl
-DIST_SUBDIRS = base anyp comm eui acl format fs repl
+SUBDIRS = base anyp parser comm eui acl format fs repl
+DIST_SUBDIRS = base anyp parser comm eui acl format fs repl
 
 if ENABLE_AUTH
 SUBDIRS += auth
@@ -647,6 +647,7 @@
 	$(ADAPTATION_LIBS) \
 	$(ESI_LIBS) \
 	$(SNMP_LIBS) \
+	parser/libsquid-parser.la \
 	$(top_builddir)/lib/libmisccontainers.la \
 	$(top_builddir)/lib/libmiscencoding.la \
 	$(top_builddir)/lib/libmiscutil.la \

=== added directory 'src/parser'
=== added file 'src/parser/Makefile.am'
--- src/parser/Makefile.am	1970-01-01 00:00:00 +0000
+++ src/parser/Makefile.am	2013-12-31 10:42:11 +0000
@@ -0,0 +1,49 @@
+include $(top_srcdir)/src/Common.am
+include $(top_srcdir)/src/TestHeaders.am
+
+EXTRA_PROGRAMS = \
+	testTokenizer
+
+check_PROGRAMS += testTokenizer
+TESTS += testTokenizer
+
+noinst_LTLIBRARIES = libsquid-parser.la
+
+libsquid_parser_la_SOURCES = \
+	Tokenizer.h \
+	Tokenizer.cc
+
+SBUF_SOURCE= \
+	$(top_srcdir)/src/base/CharacterSet.h \
+	$(top_srcdir)/src/SBuf.h \
+	$(top_srcdir)/src/SBuf.cc \
+	$(top_srcdir)/src/MemBlob.h \
+	$(top_srcdir)/src/MemBlob.cc \
+	$(top_srcdir)/src/OutOfBoundsException.h \
+	$(top_srcdir)/src/SBufExceptions.h \
+	$(top_srcdir)/src/SBufExceptions.cc \
+	$(top_srcdir)/src/String.cc \
+	$(top_srcdir)/src/SquidString.h \
+	$(top_srcdir)/src/base/TextException.h \
+	$(top_srcdir)/src/base/TextException.cc
+
+testTokenizer_SOURCES = \
+	$(SBUF_SOURCE) \
+	testTokenizer.h \
+	testTokenizer.cc \
+	Tokenizer.h
+nodist_testTokenizer_SOURCES = \
+	$(top_srcdir)/src/tests/testMain.cc \
+	$(top_srcdir)/src/tests/stub_mem.cc \
+	$(top_srcdir)/src/tests/stub_debug.cc \
+	$(top_srcdir)/src/tests/stub_time.cc \
+	$(top_srcdir)/src/tests/stub_SBufDetailedStats.cc
+testTokenizer_LDFLAGS = $(LIBADD_DL)
+testTokenizer_LDADD = \
+	libsquid-parser.la \
+	$(top_builddir)/lib/libmiscutil.la \
+	$(top_builddir)/src/base/libbase.la \
+	$(SQUID_CPPUNIT_LIBS) \
+	$(SQUID_CPPUNIT_LA) \
+	$(COMPAT_LIB)
+testTokenizer_DEPENDENCIES = $(SQUID_CPPUNIT_LA)
=== added file 'src/parser/Tokenizer.cc'
--- src/parser/Tokenizer.cc	1970-01-01 00:00:00 +0000
+++ src/parser/Tokenizer.cc	2014-05-30 12:41:24 +0000
@@ -0,0 +1,134 @@
+#include "squid.h"
+#include "parser/Tokenizer.h"
+
+bool
+Parser::Tokenizer::token(SBuf &returnedToken, const CharacterSet &delimiters)
+{
+    SBuf savebuf(buf_);
+    skip(delimiters);
+    SBuf::size_type tokenLen = buf_.findFirstOf(delimiters); // not found = npos => consume to end
+    if (tokenLen == SBuf::npos && !delimiters['\0']) {
+        // no delimiter found, nor is NUL/EOS/npos acceptable as one
+        buf_ = savebuf;
+        return false;
+    }
+    SBuf retval = buf_.consume(tokenLen);
+    skip(delimiters);
+    returnedToken = retval;
+    return true;
+}
+
+bool
+Parser::Tokenizer::prefix(SBuf &returnedToken, const CharacterSet &tokenChars, const SBuf::size_type limit)
+{
+    SBuf::size_type prefixLen = buf_.substr(0,limit).findFirstNotOf(tokenChars);
+    if (prefixLen == 0)
+        return false;
+    returnedToken = buf_.consume(prefixLen);
+    return true;
+}
+
+bool
+Parser::Tokenizer::skip(const CharacterSet &tokenChars)
+{
+    SBuf::size_type prefixLen = buf_.findFirstNotOf(tokenChars);
+    if (prefixLen == 0)
+        return false;
+    buf_.consume(prefixLen);
+    return true;
+}
+
+bool
+Parser::Tokenizer::skip(const SBuf &tokenToSkip)
+{
+    if (buf_.startsWith(tokenToSkip)) {
+        buf_.consume(tokenToSkip.length());
+        return true;
+    }
+    return false;
+}
+
+bool
+Parser::Tokenizer::skip(const char tokenChar)
+{
+    if (buf_[0] == tokenChar) {
+        buf_.consume(1);
+        return true;
+    }
+    return false;
+}
+
+/* reworked from compat/strtoll.c */
+bool
+Parser::Tokenizer::int64(int64_t &result, int base)
+{
+    if (buf_.isEmpty())
+        return false;
+
+    //fixme: account for buf_.size()
+    bool neg = false;
+    const char *s = buf_.rawContent();
+    const char *end = buf_.rawContent() + buf_.length();
+
+    if (*s == '-') {
+        neg = true;
+        ++s;
+    } else if (*s == '+') {
+        ++s;
+    }
+    if (s >= end) return false;
+    if ((base == 0 || base == 16) && *s == '0' && (s+1 < end) &&
+            tolower(*(s+1)) == 'x') {
+        s += 2;
+        base = 16;
+    }
+    if (base == 0) {
+        if (*s == '0') {
+            base = 8;
+            ++s;
+        } else {
+            base = 10;
+        }
+    }
+    if (s >= end) return false;
+
+    uint64_t cutoff;
+
+    cutoff = neg ? -static_cast<uint64_t>(INT64_MIN) : INT64_MAX;
+    int cutlim = cutoff % static_cast<uint64_t>(base);
+    cutoff /= static_cast<uint64_t>(base);
+
+    int any = 0, c;
+    int64_t acc = 0;
+    for (c = *s++; s <= end; c = *s++) {
+        if (xisdigit(c)) {
+            c -= '0';
+        } else if (xisalpha(c)) {
+            c -= xisupper(c) ? 'A' - 10 : 'a' - 10;
+        } else {
+            break;
+        }
+        if (c >= base)
+            break;
+        if (any < 0 || static_cast<uint64_t>(acc) > cutoff || (static_cast<uint64_t>(acc) == cutoff && c > cutlim))
+            any = -1;
+        else {
+            any = 1;
+            acc *= base;
+            acc += c;
+        }
+    }
+
+    if (any == 0) // nothing was parsed
+        return false;
+    if (any < 0) {
+        acc = neg ? INT64_MIN : INT64_MAX;
+        errno = ERANGE;
+        return false;
+    } else if (neg)
+        acc = -acc;
+
+    result = acc;
+    buf_.consume(s - buf_.rawContent() -1);
+    return true;
+}
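
The int64() implementation above guards against overflow with the classic
strtoll(3) cutoff/cutlim technique: precompute the largest accumulator value
that can still be multiplied by the base (cutoff) and the largest digit that
may then be added (cutlim). A standalone sketch of just that check, hardcoded
to base 10 for brevity (illustrative code, not part of the patch):

    #include <cstdint>

    // Accumulates one base-10 digit into acc; returns false instead of
    // overflowing when acc * 10 + digit would exceed INT64_MAX.
    static bool accumulateDigit(int64_t &acc, int digit)
    {
        const uint64_t limit = INT64_MAX;
        const uint64_t cutoff = limit / 10; // largest acc safe to multiply
        const int cutlim = limit % 10;      // largest digit addable at cutoff
        if (static_cast<uint64_t>(acc) > cutoff ||
                (static_cast<uint64_t>(acc) == cutoff && digit > cutlim))
            return false; // would overflow
        acc = acc * 10 + digit;
        return true;
    }

For INT64_MAX (9223372036854775807) this yields cutoff 922337203685477580 and
cutlim 7, so the guard rejects exactly the first digit that would push the
accumulator past the limit.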
=== added file 'src/parser/Tokenizer.h'
--- src/parser/Tokenizer.h	1970-01-01 00:00:00 +0000
+++ src/parser/Tokenizer.h	2014-06-01 13:53:17 +0000
@@ -0,0 +1,95 @@
+#ifndef SQUID_PARSER_TOKENIZER_H_
+#define SQUID_PARSER_TOKENIZER_H_
+
+#include "base/CharacterSet.h"
+#include "SBuf.h"
+
+/// Generic protocol-agnostic parsing tools
+namespace Parser {
+
+/**
+ * Lexical processor to tokenize a buffer.
+ *
+ * Allows arbitrary delimiters and token character sets to
+ * be provided by callers.
+ *
+ * All methods start from the beginning of the input buffer.
+ * Methods returning true consume bytes from the buffer.
+ * Methods returning false have no side-effects.
+ */
+class Tokenizer {
+public:
+    explicit Tokenizer(const SBuf &inBuf) : buf_(inBuf) {}
+
+    // return a copy of the current contents of the parse buffer
+    const SBuf buf() const { return buf_; }
+
+    /// whether the end of the buffer has been reached
+    bool atEnd() const { return buf_.isEmpty(); }
+
+    /// the remaining unprocessed section of buffer
+    const SBuf& remaining() const { return buf_; }
+
+    /// reinitialize processing for a new buffer
+    void reset(const SBuf &newBuf) { buf_ = newBuf; }
+
+    /** Basic strtok(3):
+     *  Skips all leading delimiters (if any),
+     *  accumulates all characters up to the next delimiter (a token), and
+     *  skips all trailing delimiters.
+     *
+     *  Want to extract delimiters? Use prefix() instead.
+     *
+     *  At least one terminating delimiter is required. \0 may be passed
+     *  as a delimiter to treat end of buffer content as the end of token.
+     *
+     * \return false if no terminal delimiter is found.
+     */
+    bool token(SBuf &returnedToken, const CharacterSet &delimiters);
+
+    /** Accumulates all sequential permitted characters up to an optional length limit.
+     *
+     * \retval true one or more characters were found; the sequence (string) is placed in returnedToken
+     * \retval false no characters from the permitted set were found
+     */
+    bool prefix(SBuf &returnedToken, const CharacterSet &tokenChars, SBuf::size_type limit = SBuf::npos);
+
+    /** skips all sequential characters from the set, in any order
+     *
+     * \return whether one or more characters in the set were found
+     */
+    bool skip(const CharacterSet &tokenChars);
+
+    /** skips a given character sequence (string)
+     *
+     * \return whether the exact character sequence was found and skipped
+     */
+    bool skip(const SBuf &tokenToSkip);
+
+    /** skips a given single character
+     *
+     * \return whether the character was found and skipped
+     */
+    bool skip(const char tokenChar);
+
+    /** parse a signed int64_t at the beginning of the buffer
+     *
+     * strtoll(3)-alike function: tries to parse a signed 64-bit integer
+     * at the beginning of the parse buffer, in the base specified by the user
+     * or guesstimated; consumes the parsed characters.
+     *
+     * \param result Output value. Not touched if parsing is unsuccessful.
+     * \param base   Base to do the parsing in, with the same restrictions
+     *               as strtoll(3). Defaults to 0 (meaning guess).
+     *
+     * \return whether the parsing was successful
+     */
+    bool int64(int64_t &result, int base = 0);
+
+private:
+    SBuf buf_; ///< yet unparsed input
+};
+
+} /* namespace Parser */
+
+#endif /* SQUID_PARSER_TOKENIZER_H_ */
=== added file 'src/parser/testTokenizer.cc'
--- src/parser/testTokenizer.cc	1970-01-01 00:00:00 +0000
+++ src/parser/testTokenizer.cc	2014-05-30 09:42:45 +0000
@@ -0,0 +1,218 @@
+#include "squid.h"
+#include "base/CharacterSet.h"
+#include "parser/Tokenizer.h"
+#include "testTokenizer.h"
+
+CPPUNIT_TEST_SUITE_REGISTRATION( testTokenizer );
+
+SBuf text("GET http://resource.com/path HTTP/1.1\r\n"
+          "Host: resource.com\r\n"
+          "Cookie: laijkpk3422r j1noin \r\n"
+          "\r\n");
+const CharacterSet alpha("alpha","abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ");
+const CharacterSet whitespace("whitespace"," \r\n");
+const CharacterSet crlf("crlf","\r\n");
+const CharacterSet tab("tab","\t");
+const CharacterSet numbers("numbers","0123456789");
+
+void
+testTokenizer::testTokenizerPrefix()
+{
+    Parser::Tokenizer t(text);
+    SBuf s;
+
+    // successful prefix tokenization
+    CPPUNIT_ASSERT(t.prefix(s,alpha));
+    CPPUNIT_ASSERT_EQUAL(SBuf("GET"),s);
+    CPPUNIT_ASSERT(t.prefix(s,whitespace));
+    CPPUNIT_ASSERT_EQUAL(SBuf(" "),s);
+
+    // no match (first char is not in the prefix set)
+    CPPUNIT_ASSERT(!t.prefix(s,whitespace));
+    CPPUNIT_ASSERT_EQUAL(SBuf(" "),s);
+
+    // one more match to set s to something meaningful
+    CPPUNIT_ASSERT(t.prefix(s,alpha));
+    CPPUNIT_ASSERT_EQUAL(SBuf("http"),s);
+
+    // no match (no characters from the character set in the prefix)
+    CPPUNIT_ASSERT(!t.prefix(s,tab));
+    CPPUNIT_ASSERT_EQUAL(SBuf("http"),s); // output SBuf left untouched
+
+    // match until the end of the sample
+    CharacterSet all(whitespace);
+    all += alpha;
+    all += crlf;
+    all += numbers;
+    all.add(':').add('.').add('/');
+    CPPUNIT_ASSERT(t.prefix(s,all));
+    CPPUNIT_ASSERT_EQUAL(SBuf(),t.remaining());
+}
+
+void
+testTokenizer::testTokenizerSkip()
+{
+    Parser::Tokenizer t(text);
+    SBuf s;
+
+    // first scenario: patterns match
+    // prep for test
+    CPPUNIT_ASSERT(t.prefix(s,alpha));
+    CPPUNIT_ASSERT_EQUAL(SBuf("GET"),s);
+
+    // test skipping a character set
+    CPPUNIT_ASSERT(t.skip(whitespace));
+    // check that the skip was right
+    CPPUNIT_ASSERT(t.prefix(s,alpha));
+    CPPUNIT_ASSERT_EQUAL(SBuf("http"),s);
+
+    // check skipping an exact prefix string
+    CPPUNIT_ASSERT(t.skip(SBuf("://")));
+    // verify
+    CPPUNIT_ASSERT(t.prefix(s,alpha));
+    CPPUNIT_ASSERT_EQUAL(SBuf("resource"),s);
+
+    // no skip
+    CPPUNIT_ASSERT(!t.skip(alpha));
+    CPPUNIT_ASSERT(!t.skip(SBuf("://")));
+    CPPUNIT_ASSERT(!t.skip('a'));
+}
+
+void
+testTokenizer::testTokenizerToken()
+{
+    Parser::Tokenizer t(text);
+    SBuf s;
+
+    // first scenario: patterns match
+    CPPUNIT_ASSERT(t.token(s,whitespace));
+    CPPUNIT_ASSERT_EQUAL(SBuf("GET"),s);
+    CPPUNIT_ASSERT(t.token(s,whitespace));
+    CPPUNIT_ASSERT_EQUAL(SBuf("http://resource.com/path"),s);
+    CPPUNIT_ASSERT(t.token(s,whitespace));
+    CPPUNIT_ASSERT_EQUAL(SBuf("HTTP/1.1"),s);
+    CPPUNIT_ASSERT(t.token(s,whitespace));
+    CPPUNIT_ASSERT_EQUAL(SBuf("Host:"),s);
+}
+
+void
+testTokenizer::testCharacterSet()
+{
+}
+
+void
+testTokenizer::testTokenizerInt64()
+{
+    // successful parse in base 10
+    {
+        int64_t rv;
+        Parser::Tokenizer t(SBuf("1234"));
+        const int64_t benchmark = 1234;
+        CPPUNIT_ASSERT(t.int64(rv, 10));
+        CPPUNIT_ASSERT_EQUAL(benchmark,rv);
+    }
+
+    // successful parse, autodetected base 10
+    {
+        int64_t rv;
+        Parser::Tokenizer t(SBuf("1234"));
+        const int64_t benchmark = 1234;
+        CPPUNIT_ASSERT(t.int64(rv));
+        CPPUNIT_ASSERT_EQUAL(benchmark,rv);
+    }
+
+    // successful parse, autodetected base 8 (leading zero)
+    {
+        int64_t rv;
+        Parser::Tokenizer t(SBuf("01234"));
+        const int64_t benchmark = 01234;
+        CPPUNIT_ASSERT(t.int64(rv));
+        CPPUNIT_ASSERT_EQUAL(benchmark,rv);
+    }
+
+    // successful parse, autodetected base 16 (0x prefix)
+    {
+        int64_t rv;
+        Parser::Tokenizer t(SBuf("0x12f4"));
+        const int64_t benchmark = 0x12f4;
+        CPPUNIT_ASSERT(t.int64(rv));
+        CPPUNIT_ASSERT_EQUAL(benchmark,rv);
+    }
+
+    // API mismatch with strtoll(3): don't eat a leading space
+    {
+        int64_t rv;
+        Parser::Tokenizer t(SBuf(" 1234"));
+        CPPUNIT_ASSERT(!t.int64(rv));
+    }
+
+    // API mismatch with strtoll(3): don't eat multiple leading spaces
+    {
+        int64_t rv;
+        Parser::Tokenizer t(SBuf("  1234"));
+        CPPUNIT_ASSERT(!t.int64(rv));
+    }
+
+    // trailing spaces
+    {
+        int64_t rv;
+        Parser::Tokenizer t(SBuf("1234 foo"));
+        const int64_t benchmark = 1234;
+        CPPUNIT_ASSERT(t.int64(rv));
+        CPPUNIT_ASSERT_EQUAL(benchmark,rv);
+        CPPUNIT_ASSERT_EQUAL(SBuf(" foo"), t.buf());
+    }
+
+    // trailing non-spaces
+    {
+        int64_t rv;
+        Parser::Tokenizer t(SBuf("1234foo"));
+        const int64_t benchmark = 1234;
+        CPPUNIT_ASSERT(t.int64(rv));
+        CPPUNIT_ASSERT_EQUAL(benchmark,rv);
+        CPPUNIT_ASSERT_EQUAL(SBuf("foo"), t.buf());
+    }
+
+    // trailing non-spaces; 'f' is a valid hex digit and gets consumed
+    {
+        int64_t rv;
+        Parser::Tokenizer t(SBuf("0x1234foo"));
+        const int64_t benchmark = 0x1234f;
+        CPPUNIT_ASSERT(t.int64(rv));
+        CPPUNIT_ASSERT_EQUAL(benchmark,rv);
+        CPPUNIT_ASSERT_EQUAL(SBuf("oo"), t.buf());
+    }
+
+    // overflow
+    {
+        int64_t rv;
+        Parser::Tokenizer t(SBuf("1029397752385698678762234"));
+        CPPUNIT_ASSERT(!t.int64(rv));
+    }
+
+    // buffered sub-string parsing
+    {
+        int64_t rv;
+        SBuf base("1029397752385698678762234");
+        const int64_t benchmark = 22;
+        Parser::Tokenizer t(base.substr(base.length()-4,2));
+        CPPUNIT_ASSERT_EQUAL(SBuf("22"),t.buf());
+        CPPUNIT_ASSERT(t.int64(rv));
+        CPPUNIT_ASSERT_EQUAL(benchmark,rv);
+    }
+
+    // base-16, prefix
+    {
+        int64_t rv;
+        SBuf base("deadbeefrow");
+        const int64_t benchmark = 0xdeadbeef;
+        Parser::Tokenizer t(base);
+        CPPUNIT_ASSERT(t.int64(rv,16));
+        CPPUNIT_ASSERT_EQUAL(benchmark,rv);
+        CPPUNIT_ASSERT_EQUAL(SBuf("row"),t.buf());
+    }
+}
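
One documented token() behavior the suite above does not yet exercise is the
NUL-delimiter convention: adding '\0' to the delimiter set lets end-of-buffer
terminate the final token. A hypothetical extra test case, sketched in the
fixture's style (method name and data are illustrative, not part of the patch):

    void
    testTokenizer::testTokenizerTokenAtEnd() // hypothetical method name
    {
        CharacterSet wsNul("ws-nul", " ");
        wsNul.add('\0'); // NUL in the set: end of buffer counts as delimiter
        Parser::Tokenizer t(SBuf("GET /path")); // note: no trailing delimiter
        SBuf s;
        CPPUNIT_ASSERT(t.token(s, wsNul));
        CPPUNIT_ASSERT_EQUAL(SBuf("GET"), s);
        CPPUNIT_ASSERT(t.token(s, wsNul)); // succeeds only due to '\0'
        CPPUNIT_ASSERT_EQUAL(SBuf("/path"), s);
        CPPUNIT_ASSERT(t.atEnd());
    }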
=== added file 'src/parser/testTokenizer.h'
--- src/parser/testTokenizer.h	1970-01-01 00:00:00 +0000
+++ src/parser/testTokenizer.h	2014-05-26 13:04:01 +0000
@@ -0,0 +1,24 @@
+#ifndef SQUID_TESTTOKENIZER_H_
+#define SQUID_TESTTOKENIZER_H_
+
+#include <cppunit/extensions/HelperMacros.h>
+
+class testTokenizer : public CPPUNIT_NS::TestFixture
+{
+    CPPUNIT_TEST_SUITE( testTokenizer );
+    CPPUNIT_TEST ( testCharacterSet );
+    CPPUNIT_TEST ( testTokenizerPrefix );
+    CPPUNIT_TEST ( testTokenizerSkip );
+    CPPUNIT_TEST ( testTokenizerToken );
+    CPPUNIT_TEST ( testTokenizerInt64 );
+    CPPUNIT_TEST_SUITE_END();
+
+protected:
+    void testTokenizerPrefix();
+    void testTokenizerSkip();
+    void testTokenizerToken();
+    void testCharacterSet();
+    void testTokenizerInt64();
+};
+
+#endif /* SQUID_TESTTOKENIZER_H_ */
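
The fixture registers itself through CPPUNIT_TEST_SUITE_REGISTRATION and gets
its main() from src/tests/testMain.cc (linked in via
nodist_testTokenizer_SOURCES), so `make check` under src/parser builds and
runs testTokenizer. For readers unfamiliar with that wiring, a minimal
equivalent driver sketched from the standard CppUnit API (not copied from
testMain.cc):

    #include <cppunit/extensions/TestFactoryRegistry.h>
    #include <cppunit/ui/text/TestRunner.h>

    int
    main(int, char **)
    {
        CppUnit::TextUi::TestRunner runner;
        // collect every suite registered via CPPUNIT_TEST_SUITE_REGISTRATION
        runner.addTest(CppUnit::TestFactoryRegistry::getRegistry().makeTest());
        return runner.run() ? 0 : 1; // run() returns true when all tests pass
    }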