------------------------------------------------------------
revno: 13428 [merge]
revision-id: kinkie@squid-cache.org-20140601162130-qtcw5mehtnzr1eaz
parent: squid3@treenet.co.nz-20140601123803-797lii07lhzsp5kz
parent: kinkie@squid-cache.org-20140601141737-7200b93in3i1g3jh
committer: Francesco Chemolli <kinkie@squid-cache.org>
branch nick: trunk
timestamp: Sun 2014-06-01 18:21:30 +0200
message:
  Merge: SBuf-based Tokenizer
------------------------------------------------------------
Use --include-merges or -n0 to see merged revisions.
------------------------------------------------------------
# Bazaar merge directive format 2 (Bazaar 0.90)
# revision_id: kinkie@squid-cache.org-20140601162130-qtcw5mehtnzr1eaz
# target_branch: http://bzr.squid-cache.org/bzr/squid3/trunk/
# testament_sha1: ecc95267b47ead1679da3480e385ff58b4a6284d
# timestamp: 2014-06-01 16:53:49 +0000
# source_branch: http://bzr.squid-cache.org/bzr/squid3/trunk/
# base_revision_id: squid3@treenet.co.nz-20140601123803-\
#   797lii07lhzsp5kz
#
# Begin patch
=== modified file 'configure.ac'
--- configure.ac	2014-05-11 19:12:35 +0000
+++ configure.ac	2014-05-27 08:23:05 +0000
@@ -3467,6 +3467,7 @@
 	src/ipc/Makefile
 	src/ssl/Makefile
 	src/mgr/Makefile
+	src/parser/Makefile
 	src/snmp/Makefile
 	contrib/Makefile
 	icons/Makefile

=== modified file 'src/Makefile.am'
--- src/Makefile.am	2014-04-30 10:50:09 +0000
+++ src/Makefile.am	2014-05-27 08:23:05 +0000
@@ -46,8 +46,8 @@
 	LoadableModules.h \
 	LoadableModules.cc
 
-SUBDIRS = base anyp comm eui acl format fs repl
-DIST_SUBDIRS = base anyp comm eui acl format fs repl
+SUBDIRS = base anyp parser comm eui acl format fs repl
+DIST_SUBDIRS = base anyp parser comm eui acl format fs repl
 
 if ENABLE_AUTH
 SUBDIRS += auth
@@ -647,6 +647,7 @@
 	$(ADAPTATION_LIBS) \
 	$(ESI_LIBS) \
 	$(SNMP_LIBS) \
+	parser/libsquid-parser.la \
 	$(top_builddir)/lib/libmisccontainers.la \
 	$(top_builddir)/lib/libmiscencoding.la \
 	$(top_builddir)/lib/libmiscutil.la \

=== added directory 'src/parser'
=== added file 'src/parser/Makefile.am'
--- src/parser/Makefile.am	1970-01-01 00:00:00 +0000
+++ src/parser/Makefile.am	2013-12-31 10:42:11 +0000
@@ -0,0 +1,49 @@
+include $(top_srcdir)/src/Common.am
+include $(top_srcdir)/src/TestHeaders.am
+
+EXTRA_PROGRAMS = \
+	testTokenizer
+
+check_PROGRAMS += testTokenizer
+TESTS += testTokenizer
+
+noinst_LTLIBRARIES = libsquid-parser.la
+
+libsquid_parser_la_SOURCES = \
+	Tokenizer.h \
+	Tokenizer.cc
+
+SBUF_SOURCE= \
+	$(top_srcdir)/src/base/CharacterSet.h \
+	$(top_srcdir)/src/SBuf.h \
+	$(top_srcdir)/src/SBuf.cc \
+	$(top_srcdir)/src/MemBlob.h \
+	$(top_srcdir)/src/MemBlob.cc \
+	$(top_srcdir)/src/OutOfBoundsException.h \
+	$(top_srcdir)/src/SBufExceptions.h \
+	$(top_srcdir)/src/SBufExceptions.cc \
+	$(top_srcdir)/src/String.cc \
+	$(top_srcdir)/src/SquidString.h \
+	$(top_srcdir)/src/base/TextException.h \
+	$(top_srcdir)/src/base/TextException.cc
+
+testTokenizer_SOURCES = \
+	$(SBUF_SOURCE) \
+	testTokenizer.h \
+	testTokenizer.cc \
+	Tokenizer.h
+nodist_testTokenizer_SOURCES = \
+	$(top_srcdir)/src/tests/testMain.cc \
+	$(top_srcdir)/src/tests/stub_mem.cc \
+	$(top_srcdir)/src/tests/stub_debug.cc \
+	$(top_srcdir)/src/tests/stub_time.cc \
+	$(top_srcdir)/src/tests/stub_SBufDetailedStats.cc
+testTokenizer_LDFLAGS = $(LIBADD_DL)
+testTokenizer_LDADD = \
+	libsquid-parser.la \
+	$(top_builddir)/lib/libmiscutil.la \
+	$(top_builddir)/src/base/libbase.la \
+	$(SQUID_CPPUNIT_LIBS) \
+	$(SQUID_CPPUNIT_LA) \
+	$(COMPAT_LIB)
+testTokenizer_DEPENDENCIES = $(SQUID_CPPUNIT_LA)
=== added file 'src/parser/Tokenizer.cc'
--- src/parser/Tokenizer.cc	1970-01-01 00:00:00 +0000
+++ src/parser/Tokenizer.cc	2014-05-30 12:41:24 +0000
@@ -0,0 +1,134 @@
+#include "squid.h"
+#include "parser/Tokenizer.h"
+
+bool
+Parser::Tokenizer::token(SBuf &returnedToken, const CharacterSet &delimiters)
+{
+    SBuf savebuf(buf_);
+    skip(delimiters);
+    SBuf::size_type tokenLen = buf_.findFirstOf(delimiters); // not found = npos => consume to end
+    if (tokenLen == SBuf::npos && !delimiters['\0']) {
+        // no delimiter found, nor is NUL/EOS/npos acceptable as one
+        buf_ = savebuf;
+        return false;
+    }
+    SBuf retval = buf_.consume(tokenLen);
+    skip(delimiters);
+    returnedToken = retval;
+    return true;
+}
+
+bool
+Parser::Tokenizer::prefix(SBuf &returnedToken, const CharacterSet &tokenChars, const SBuf::size_type limit)
+{
+    SBuf::size_type prefixLen = buf_.substr(0,limit).findFirstNotOf(tokenChars);
+    if (prefixLen == 0)
+        return false;
+    returnedToken = buf_.consume(prefixLen);
+    return true;
+}
+
+bool
+Parser::Tokenizer::skip(const CharacterSet &tokenChars)
+{
+    SBuf::size_type prefixLen = buf_.findFirstNotOf(tokenChars);
+    if (prefixLen == 0)
+        return false;
+    buf_.consume(prefixLen);
+    return true;
+}
+
+bool
+Parser::Tokenizer::skip(const SBuf &tokenToSkip)
+{
+    if (buf_.startsWith(tokenToSkip)) {
+        buf_.consume(tokenToSkip.length());
+        return true;
+    }
+    return false;
+}
+
+bool
+Parser::Tokenizer::skip(const char tokenChar)
+{
+    if (buf_[0] == tokenChar) {
+        buf_.consume(1);
+        return true;
+    }
+    return false;
+}
+
+/* reworked from compat/strtoll.c */
+bool
+Parser::Tokenizer::int64(int64_t &result, int base)
+{
+    if (buf_.isEmpty())
+        return false;
+
+    //fixme: account for buf_.size()
+    bool neg = false;
+    const char *s = buf_.rawContent();
+    const char *end = buf_.rawContent() + buf_.length();
+
+    if (*s == '-') {
+        neg = true;
+        ++s;
+    } else if (*s == '+') {
+        ++s;
+    }
+    if (s >= end) return false;
+    if ((base == 0 || base == 16) && *s == '0' && (s+1 < end) &&
+            tolower(*(s+1)) == 'x') {
+        s += 2;
+        base = 16;
+    }
+    if (base == 0) {
+        if (*s == '0') {
+            base = 8;
+            ++s;
+        } else {
+            base = 10;
+        }
+    }
+    if (s >= end) return false;
+
+    uint64_t cutoff;
+
+    cutoff = neg ? -static_cast<uint64_t>(INT64_MIN) : INT64_MAX;
+    int cutlim = cutoff % static_cast<uint64_t>(base);
+    cutoff /= static_cast<uint64_t>(base);
+
+    int any = 0, c;
+    int64_t acc = 0;
+    for (c = *s++; s <= end; c = *s++) {
+        if (xisdigit(c)) {
+            c -= '0';
+        } else if (xisalpha(c)) {
+            c -= xisupper(c) ? 'A' - 10 : 'a' - 10;
+        } else {
+            break;
+        }
+        if (c >= base)
+            break;
+        if (any < 0 || static_cast<uint64_t>(acc) > cutoff || (static_cast<uint64_t>(acc) == cutoff && c > cutlim))
+            any = -1;
+        else {
+            any = 1;
+            acc *= base;
+            acc += c;
+        }
+    }
+
+    if (any == 0) // nothing was parsed
+        return false;
+    if (any < 0) {
+        acc = neg ? INT64_MIN : INT64_MAX;
+        errno = ERANGE;
+        return false;
+    } else if (neg)
+        acc = -acc;
+
+    result = acc;
+    buf_.consume(s - buf_.rawContent() -1);
+    return true;
+}
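
The int64() implementation above guards against overflow with the classic
strtoll(3) cutoff/cutlim technique: precompute the largest accumulator value
that can still be multiplied by the base (cutoff) and the largest digit that
may then be added (cutlim). A standalone sketch of just that check, hardcoded
to base 10 for brevity (illustrative code, not part of the patch):

    #include <cstdint>

    // Accumulates one base-10 digit into acc; returns false instead of
    // overflowing when acc * 10 + digit would exceed INT64_MAX.
    static bool accumulateDigit(int64_t &acc, int digit)
    {
        const uint64_t limit = INT64_MAX;
        const uint64_t cutoff = limit / 10; // largest acc safe to multiply
        const int cutlim = limit % 10;      // largest digit addable at cutoff
        if (static_cast<uint64_t>(acc) > cutoff ||
                (static_cast<uint64_t>(acc) == cutoff && digit > cutlim))
            return false; // would overflow
        acc = acc * 10 + digit;
        return true;
    }

For INT64_MAX (9223372036854775807) this yields cutoff 922337203685477580 and
cutlim 7, so the guard rejects exactly the first digit that would push the
accumulator past the limit.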
=== added file 'src/parser/Tokenizer.h'
--- src/parser/Tokenizer.h	1970-01-01 00:00:00 +0000
+++ src/parser/Tokenizer.h	2014-06-01 13:53:17 +0000
@@ -0,0 +1,95 @@
+#ifndef SQUID_PARSER_TOKENIZER_H_
+#define SQUID_PARSER_TOKENIZER_H_
+
+#include "base/CharacterSet.h"
+#include "SBuf.h"
+
+/// Generic protocol-agnostic parsing tools
+namespace Parser {
+
+/**
+ * Lexical processor to tokenize a buffer.
+ *
+ * Allows arbitrary delimiters and token character sets to
+ * be provided by callers.
+ *
+ * All methods start from the beginning of the input buffer.
+ * Methods returning true consume bytes from the buffer.
+ * Methods returning false have no side-effects.
+ */
+class Tokenizer {
+public:
+    explicit Tokenizer(const SBuf &inBuf) : buf_(inBuf) {}
+
+    // return a copy of the current contents of the parse buffer
+    const SBuf buf() const { return buf_; }
+
+    /// whether the end of the buffer has been reached
+    bool atEnd() const { return buf_.isEmpty(); }
+
+    /// the remaining unprocessed section of buffer
+    const SBuf& remaining() const { return buf_; }
+
+    /// reinitialize processing for a new buffer
+    void reset(const SBuf &newBuf) { buf_ = newBuf; }
+
+    /** Basic strtok(3):
+     *  Skips all leading delimiters (if any),
+     *  accumulates all characters up to the next delimiter (a token), and
+     *  skips all trailing delimiters.
+     *
+     *  Want to extract delimiters? Use prefix() instead.
+     *
+     *  At least one terminating delimiter is required. \0 may be passed
+     *  as a delimiter to treat end of buffer content as the end of token.
+     *
+     * \return false if no terminal delimiter is found.
+     */
+    bool token(SBuf &returnedToken, const CharacterSet &delimiters);
+
+    /** Accumulates all sequential permitted characters up to an optional length limit.
+     *
+     * \retval true one or more characters were found; the sequence (string) is placed in returnedToken
+     * \retval false no characters from the permitted set were found
+     */
+    bool prefix(SBuf &returnedToken, const CharacterSet &tokenChars, SBuf::size_type limit = SBuf::npos);
+
+    /** skips all sequential characters from the set, in any order
+     *
+     * \return whether one or more characters in the set were found
+     */
+    bool skip(const CharacterSet &tokenChars);
+
+    /** skips a given character sequence (string)
+     *
+     * \return whether the exact character sequence was found and skipped
+     */
+    bool skip(const SBuf &tokenToSkip);
+
+    /** skips a given single character
+     *
+     * \return whether the character was found and skipped
+     */
+    bool skip(const char tokenChar);
+
+    /** parse a signed int64_t at the beginning of the buffer
+     *
+     * strtoll(3)-alike function: tries to parse a signed 64-bit integer
+     * at the beginning of the parse buffer, in the base specified by the user
+     * or guesstimated; consumes the parsed characters.
+     *
+     * \param result Output value. Not touched if parsing is unsuccessful.
+     * \param base   Base to do the parsing in, with the same restrictions
+     *               as strtoll(3). Defaults to 0 (meaning guess).
+     *
+     * \return whether the parsing was successful
+     */
+    bool int64(int64_t &result, int base = 0);
+
+private:
+    SBuf buf_; ///< yet unparsed input
+};
+
+} /* namespace Parser */
+
+#endif /* SQUID_PARSER_TOKENIZER_H_ */
=== added file 'src/parser/testTokenizer.cc'
--- src/parser/testTokenizer.cc	1970-01-01 00:00:00 +0000
+++ src/parser/testTokenizer.cc	2014-05-30 09:42:45 +0000
@@ -0,0 +1,218 @@
+#include "squid.h"
+#include "base/CharacterSet.h"
+#include "parser/Tokenizer.h"
+#include "testTokenizer.h"
+
+CPPUNIT_TEST_SUITE_REGISTRATION( testTokenizer );
+
+SBuf text("GET http://resource.com/path HTTP/1.1\r\n"
+          "Host: resource.com\r\n"
+          "Cookie: laijkpk3422r j1noin \r\n"
+          "\r\n");
+const CharacterSet alpha("alpha","abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ");
+const CharacterSet whitespace("whitespace"," \r\n");
+const CharacterSet crlf("crlf","\r\n");
+const CharacterSet tab("tab","\t");
+const CharacterSet numbers("numbers","0123456789");
+
+void
+testTokenizer::testTokenizerPrefix()
+{
+    Parser::Tokenizer t(text);
+    SBuf s;
+
+    // successful prefix tokenization
+    CPPUNIT_ASSERT(t.prefix(s,alpha));
+    CPPUNIT_ASSERT_EQUAL(SBuf("GET"),s);
+    CPPUNIT_ASSERT(t.prefix(s,whitespace));
+    CPPUNIT_ASSERT_EQUAL(SBuf(" "),s);
+
+    // no match (first char is not in the prefix set)
+    CPPUNIT_ASSERT(!t.prefix(s,whitespace));
+    CPPUNIT_ASSERT_EQUAL(SBuf(" "),s);
+
+    // one more match to set s to something meaningful
+    CPPUNIT_ASSERT(t.prefix(s,alpha));
+    CPPUNIT_ASSERT_EQUAL(SBuf("http"),s);
+
+    // no match (no characters from the character set in the prefix)
+    CPPUNIT_ASSERT(!t.prefix(s,tab));
+    CPPUNIT_ASSERT_EQUAL(SBuf("http"),s); // output SBuf left untouched
+
+    // match until the end of the sample
+    CharacterSet all(whitespace);
+    all += alpha;
+    all += crlf;
+    all += numbers;
+    all.add(':').add('.').add('/');
+    CPPUNIT_ASSERT(t.prefix(s,all));
+    CPPUNIT_ASSERT_EQUAL(SBuf(),t.remaining());
+}
+
+void
+testTokenizer::testTokenizerSkip()
+{
+    Parser::Tokenizer t(text);
+    SBuf s;
+
+    // first scenario: patterns match
+    // prep for test
+    CPPUNIT_ASSERT(t.prefix(s,alpha));
+    CPPUNIT_ASSERT_EQUAL(SBuf("GET"),s);
+
+    // test skipping a character set
+    CPPUNIT_ASSERT(t.skip(whitespace));
+    // check that the skip was right
+    CPPUNIT_ASSERT(t.prefix(s,alpha));
+    CPPUNIT_ASSERT_EQUAL(SBuf("http"),s);
+
+    // check skipping an exact prefix string
+    CPPUNIT_ASSERT(t.skip(SBuf("://")));
+    // verify
+    CPPUNIT_ASSERT(t.prefix(s,alpha));
+    CPPUNIT_ASSERT_EQUAL(SBuf("resource"),s);
+
+    // no skip
+    CPPUNIT_ASSERT(!t.skip(alpha));
+    CPPUNIT_ASSERT(!t.skip(SBuf("://")));
+    CPPUNIT_ASSERT(!t.skip('a'));
+}
+
+void
+testTokenizer::testTokenizerToken()
+{
+    Parser::Tokenizer t(text);
+    SBuf s;
+
+    // first scenario: patterns match
+    CPPUNIT_ASSERT(t.token(s,whitespace));
+    CPPUNIT_ASSERT_EQUAL(SBuf("GET"),s);
+    CPPUNIT_ASSERT(t.token(s,whitespace));
+    CPPUNIT_ASSERT_EQUAL(SBuf("http://resource.com/path"),s);
+    CPPUNIT_ASSERT(t.token(s,whitespace));
+    CPPUNIT_ASSERT_EQUAL(SBuf("HTTP/1.1"),s);
+    CPPUNIT_ASSERT(t.token(s,whitespace));
+    CPPUNIT_ASSERT_EQUAL(SBuf("Host:"),s);
+}
+
+void
+testTokenizer::testCharacterSet()
+{
+}
+
+void
+testTokenizer::testTokenizerInt64()
+{
+    // successful parse in base 10
+    {
+        int64_t rv;
+        Parser::Tokenizer t(SBuf("1234"));
+        const int64_t benchmark = 1234;
+        CPPUNIT_ASSERT(t.int64(rv, 10));
+        CPPUNIT_ASSERT_EQUAL(benchmark,rv);
+    }
+
+    // successful parse, autodetected base 10
+    {
+        int64_t rv;
+        Parser::Tokenizer t(SBuf("1234"));
+        const int64_t benchmark = 1234;
+        CPPUNIT_ASSERT(t.int64(rv));
+        CPPUNIT_ASSERT_EQUAL(benchmark,rv);
+    }
+
+    // successful parse, autodetected base 8 (leading zero)
+    {
+        int64_t rv;
+        Parser::Tokenizer t(SBuf("01234"));
+        const int64_t benchmark = 01234;
+        CPPUNIT_ASSERT(t.int64(rv));
+        CPPUNIT_ASSERT_EQUAL(benchmark,rv);
+    }
+
+    // successful parse, autodetected base 16 (0x prefix)
+    {
+        int64_t rv;
+        Parser::Tokenizer t(SBuf("0x12f4"));
+        const int64_t benchmark = 0x12f4;
+        CPPUNIT_ASSERT(t.int64(rv));
+        CPPUNIT_ASSERT_EQUAL(benchmark,rv);
+    }
+
+    // API mismatch with strtoll(3): don't eat a leading space
+    {
+        int64_t rv;
+        Parser::Tokenizer t(SBuf(" 1234"));
+        CPPUNIT_ASSERT(!t.int64(rv));
+    }
+
+    // API mismatch with strtoll(3): don't eat multiple leading spaces
+    {
+        int64_t rv;
+        Parser::Tokenizer t(SBuf("  1234"));
+        CPPUNIT_ASSERT(!t.int64(rv));
+    }
+
+    // trailing spaces
+    {
+        int64_t rv;
+        Parser::Tokenizer t(SBuf("1234 foo"));
+        const int64_t benchmark = 1234;
+        CPPUNIT_ASSERT(t.int64(rv));
+        CPPUNIT_ASSERT_EQUAL(benchmark,rv);
+        CPPUNIT_ASSERT_EQUAL(SBuf(" foo"), t.buf());
+    }
+
+    // trailing non-spaces
+    {
+        int64_t rv;
+        Parser::Tokenizer t(SBuf("1234foo"));
+        const int64_t benchmark = 1234;
+        CPPUNIT_ASSERT(t.int64(rv));
+        CPPUNIT_ASSERT_EQUAL(benchmark,rv);
+        CPPUNIT_ASSERT_EQUAL(SBuf("foo"), t.buf());
+    }
+
+    // trailing non-spaces; 'f' is a valid hex digit and gets consumed
+    {
+        int64_t rv;
+        Parser::Tokenizer t(SBuf("0x1234foo"));
+        const int64_t benchmark = 0x1234f;
+        CPPUNIT_ASSERT(t.int64(rv));
+        CPPUNIT_ASSERT_EQUAL(benchmark,rv);
+        CPPUNIT_ASSERT_EQUAL(SBuf("oo"), t.buf());
+    }
+
+    // overflow
+    {
+        int64_t rv;
+        Parser::Tokenizer t(SBuf("1029397752385698678762234"));
+        CPPUNIT_ASSERT(!t.int64(rv));
+    }
+
+    // buffered sub-string parsing
+    {
+        int64_t rv;
+        SBuf base("1029397752385698678762234");
+        const int64_t benchmark = 22;
+        Parser::Tokenizer t(base.substr(base.length()-4,2));
+        CPPUNIT_ASSERT_EQUAL(SBuf("22"),t.buf());
+        CPPUNIT_ASSERT(t.int64(rv));
+        CPPUNIT_ASSERT_EQUAL(benchmark,rv);
+    }
+
+    // base-16, prefix
+    {
+        int64_t rv;
+        SBuf base("deadbeefrow");
+        const int64_t benchmark = 0xdeadbeef;
+        Parser::Tokenizer t(base);
+        CPPUNIT_ASSERT(t.int64(rv,16));
+        CPPUNIT_ASSERT_EQUAL(benchmark,rv);
+        CPPUNIT_ASSERT_EQUAL(SBuf("row"),t.buf());
+    }
+}
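
One documented token() behavior the suite above does not yet exercise is the
NUL-delimiter convention: adding '\0' to the delimiter set lets end-of-buffer
terminate the final token. A hypothetical extra test case, sketched in the
fixture's style (method name and data are illustrative, not part of the patch):

    void
    testTokenizer::testTokenizerTokenAtEnd() // hypothetical method name
    {
        CharacterSet wsNul("ws-nul", " ");
        wsNul.add('\0'); // NUL in the set: end of buffer counts as delimiter
        Parser::Tokenizer t(SBuf("GET /path")); // note: no trailing delimiter
        SBuf s;
        CPPUNIT_ASSERT(t.token(s, wsNul));
        CPPUNIT_ASSERT_EQUAL(SBuf("GET"), s);
        CPPUNIT_ASSERT(t.token(s, wsNul)); // succeeds only due to '\0'
        CPPUNIT_ASSERT_EQUAL(SBuf("/path"), s);
        CPPUNIT_ASSERT(t.atEnd());
    }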
=== added file 'src/parser/testTokenizer.h'
--- src/parser/testTokenizer.h	1970-01-01 00:00:00 +0000
+++ src/parser/testTokenizer.h	2014-05-26 13:04:01 +0000
@@ -0,0 +1,24 @@
+#ifndef SQUID_TESTTOKENIZER_H_
+#define SQUID_TESTTOKENIZER_H_
+
+#include <cppunit/extensions/HelperMacros.h>
+
+class testTokenizer : public CPPUNIT_NS::TestFixture
+{
+    CPPUNIT_TEST_SUITE( testTokenizer );
+    CPPUNIT_TEST ( testCharacterSet );
+    CPPUNIT_TEST ( testTokenizerPrefix );
+    CPPUNIT_TEST ( testTokenizerSkip );
+    CPPUNIT_TEST ( testTokenizerToken );
+    CPPUNIT_TEST ( testTokenizerInt64 );
+    CPPUNIT_TEST_SUITE_END();
+
+protected:
+    void testTokenizerPrefix();
+    void testTokenizerSkip();
+    void testTokenizerToken();
+    void testCharacterSet();
+    void testTokenizerInt64();
+};
+
+#endif /* SQUID_TESTTOKENIZER_H_ */
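
The fixture registers itself through CPPUNIT_TEST_SUITE_REGISTRATION and gets
its main() from src/tests/testMain.cc (linked in via
nodist_testTokenizer_SOURCES), so `make check` under src/parser builds and
runs testTokenizer. For readers unfamiliar with that wiring, a minimal
equivalent driver sketched from the standard CppUnit API (not copied from
testMain.cc):

    #include <cppunit/extensions/TestFactoryRegistry.h>
    #include <cppunit/ui/text/TestRunner.h>

    int
    main(int, char **)
    {
        CppUnit::TextUi::TestRunner runner;
        // collect every suite registered via CPPUNIT_TEST_SUITE_REGISTRATION
        runner.addTest(CppUnit::TestFactoryRegistry::getRegistry().makeTest());
        return runner.run() ? 0 : 1; // run() returns true when all tests pass
    }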