boost/xpressive/detail/dynamic/parse_charset.hpp

///////////////////////////////////////////////////////////////////////////////
// parse_charset.hpp
//
//  Copyright 2004 Eric Niebler. Distributed under the Boost
//  Software License, Version 1.0. (See accompanying file
//  LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)

#ifndef BOOST_XPRESSIVE_DETAIL_DYNAMIC_PARSE_CHARSET_HPP_EAN_10_04_2005
#define BOOST_XPRESSIVE_DETAIL_DYNAMIC_PARSE_CHARSET_HPP_EAN_10_04_2005

// MS compatible compilers support #pragma once
#if defined(_MSC_VER) && (_MSC_VER >= 1020)
# pragma once
#endif

#include <boost/mpl/bool.hpp>
#include <boost/integer.hpp>
#include <boost/numeric/conversion/converter.hpp>
#include <boost/xpressive/detail/detail_fwd.hpp>
#include <boost/xpressive/detail/dynamic/parser_enum.hpp>
#include <boost/xpressive/detail/utility/literals.hpp>
#include <boost/xpressive/detail/utility/chset/chset.hpp>
#include <boost/xpressive/regex_constants.hpp>

namespace boost { namespace xpressive { namespace detail
{

enum escape_type
{
    escape_char
  , escape_mark
  , escape_class
};

///////////////////////////////////////////////////////////////////////////////
// escape_value
//
template<typename Char, typename Class>
struct escape_value
{
    Char ch_;
    int mark_nbr_;
    Class class_;
    escape_type type_;
};

///////////////////////////////////////////////////////////////////////////////
// char_overflow_handler
//
struct char_overflow_handler
{
    void operator ()(numeric::range_check_result result) const // throw(regex_error)
    {
        if(numeric::cInRange != result)
        {
            throw regex_error(regex_constants::error_escape,
                "character escape too large to fit in target character type");
        }
    }
};

///////////////////////////////////////////////////////////////////////////////
// parse_escape
//
template<typename FwdIter, typename CompilerTraits>
escape_value<typename iterator_value<FwdIter>::type, typename CompilerTraits::regex_traits::char_class_type>
parse_escape(FwdIter &begin, FwdIter end, CompilerTraits &traits)
{
    using namespace regex_constants;
    typedef typename iterator_value<FwdIter>::type char_type;
    typedef typename CompilerTraits::regex_traits regex_traits;
    typedef typename regex_traits::char_class_type char_class_type;

    // define an unsigned type the same size as char_type
    typedef typename boost::uint_t<CHAR_BIT * sizeof(char_type)>::least uchar_t;
    BOOST_MPL_ASSERT_RELATION(sizeof(uchar_t), ==, sizeof(char_type));
    typedef numeric::conversion_traits<uchar_t, int> converstion_traits;

    ensure(begin != end, error_escape, "unexpected end of pattern found");
    numeric::converter<int, uchar_t, converstion_traits, char_overflow_handler> converter;
    escape_value<char_type,char_class_type> esc = { 0, 0, 0, escape_char };
    bool const icase = (0 != (regex_constants::icase_ & traits.flags()));
    regex_traits const &rxtraits = traits.traits();
    FwdIter tmp;

    esc.class_ = rxtraits.lookup_classname(begin, begin + 1, icase);
    if(0 != esc.class_)
    {
        esc.type_ = escape_class;
        return esc;
    }

    if(-1 != rxtraits.value(*begin, 8))
    {
        esc.ch_ = converter(toi(begin, end, rxtraits, 8, 0777));
        return esc;
    }

    switch(*begin)
    {
    // bell character
    case BOOST_XPR_CHAR_(char_type, 'a'):
        esc.ch_ = BOOST_XPR_CHAR_(char_type, '\a');
        ++begin;
        break;
    // escape character
    case BOOST_XPR_CHAR_(char_type, 'e'):
        esc.ch_ = converter(27);
        ++begin;
        break;
    // control character
    case BOOST_XPR_CHAR_(char_type, 'c'):
        ensure(++begin != end, error_escape, "unexpected end of pattern found");
        ensure
        (
            rxtraits.in_range(BOOST_XPR_CHAR_(char_type, 'a'), BOOST_XPR_CHAR_(char_type, 'z'), *begin)
         || rxtraits.in_range(BOOST_XPR_CHAR_(char_type, 'A'), BOOST_XPR_CHAR_(char_type, 'Z'), *begin)
          , error_escape
          , "invalid escape control letter; must be one of a-z or A-Z"
        );
        // Convert to character according to ECMA-262, section 15.10.2.10:
        esc.ch_ = converter(*begin % 32);
        ++begin;
        break;
    // formfeed character
    case BOOST_XPR_CHAR_(char_type, 'f'):
        esc.ch_ = BOOST_XPR_CHAR_(char_type, '\f');
        ++begin;
        break;
    // newline
    case BOOST_XPR_CHAR_(char_type, 'n'):
        esc.ch_ = BOOST_XPR_CHAR_(char_type, '\n');
        ++begin;
        break;
    // return
    case BOOST_XPR_CHAR_(char_type, 'r'):
        esc.ch_ = BOOST_XPR_CHAR_(char_type, '\r');
        ++begin;
        break;
    // horizontal tab
    case BOOST_XPR_CHAR_(char_type, 't'):
        esc.ch_ = BOOST_XPR_CHAR_(char_type, '\t');
        ++begin;
        break;
    // vertical tab
    case BOOST_XPR_CHAR_(char_type, 'v'):
        esc.ch_ = BOOST_XPR_CHAR_(char_type, '\v');
        ++begin;
        break;
    // hex escape sequence
    case BOOST_XPR_CHAR_(char_type, 'x'):
        ensure(++begin != end, error_escape, "unexpected end of pattern found");
        tmp = begin;
        esc.ch_ = converter(toi(begin, end, rxtraits, 16, 0xff));
        ensure(2 == std::distance(tmp, begin), error_escape, "invalid hex escape : "
            "must be \\x HexDigit HexDigit");
        break;
    // Unicode escape sequence
    case BOOST_XPR_CHAR_(char_type, 'u'):
        ensure(++begin != end, error_escape, "unexpected end of pattern found");
        tmp = begin;
        esc.ch_ = converter(toi(begin, end, rxtraits, 16, 0xffff));
        ensure(4 == std::distance(tmp, begin), error_escape, "invalid Unicode escape : "
            "must be \\u HexDigit HexDigit HexDigit HexDigit");
        break;
    // backslash
    case BOOST_XPR_CHAR_(char_type, '\\'):
        //esc.ch_ = BOOST_XPR_CHAR_(char_type, '\\');
        //++begin;
        //break;
    // all other escaped characters represent themselves
    default:
        esc.ch_ = *begin;
        ++begin;
        break;
    }

    return esc;
}

//////////////////////////////////////////////////////////////////////////
// parse_charset
//
template<typename FwdIter, typename RegexTraits, typename CompilerTraits>
inline void parse_charset
(
    FwdIter &begin
  , FwdIter end
  , compound_charset<RegexTraits> &chset
  , CompilerTraits &traits
)
{
    using namespace regex_constants;
    typedef typename RegexTraits::char_type char_type;
    typedef typename RegexTraits::char_class_type char_class_type;
    BOOST_ASSERT(begin != end);
    RegexTraits const &rxtraits = traits.traits();
    bool const icase = (0 != (regex_constants::icase_ & traits.flags()));
    FwdIter iprev = FwdIter();
    escape_value<char_type, char_class_type> esc = {0, 0, 0, escape_char};
    bool invert = false;

    // check to see if we have an inverse charset
    if(begin != end && token_charset_invert == traits.get_charset_token(iprev = begin, end))
    {
        begin = iprev;
        invert = true;
    }

    // skip the end token if-and-only-if it is the first token in the charset
    if(begin != end && token_charset_end == traits.get_charset_token(iprev = begin, end))
    {
        for(; begin != iprev; ++begin)
        {
            chset.set_char(*begin, rxtraits, icase);
        }
    }

    compiler_token_type tok;
    char_type ch_prev = char_type(), ch_next = char_type();
    bool have_prev = false;

    ensure(begin != end, error_brack, "unexpected end of pattern found");

    // remember the current position and grab the next token
    iprev = begin;
    tok = traits.get_charset_token(begin, end);
    do
    {
        ensure(begin != end, error_brack, "unexpected end of pattern found");

        if(token_charset_hyphen == tok && have_prev)
        {
            // remember the current position
            FwdIter iprev2 = begin;
            have_prev = false;

            // ch_prev is lower bound of a range
            switch(traits.get_charset_token(begin, end))
            {
            case token_charset_hyphen:
            case token_charset_invert:
                begin = iprev2; // un-get these tokens and fall through
            case token_literal:
                ch_next = *begin++;
                detail::ensure(ch_prev <= ch_next, error_range, "invalid charset range");
                chset.set_range(ch_prev, ch_next, rxtraits, icase);
                continue;
            case token_charset_backspace:
                ch_next = char_type(8); // backspace
                detail::ensure(ch_prev <= ch_next, error_range, "invalid charset range");
                chset.set_range(ch_prev, ch_next, rxtraits, icase);
                continue;
            case token_escape:
                esc = parse_escape(begin, end, traits);
                if(escape_char == esc.type_)
                {
                    detail::ensure(ch_prev <= esc.ch_, error_range, "invalid charset range");
                    chset.set_range(ch_prev, esc.ch_, rxtraits, icase);
                    continue;
                }
            case token_charset_end: // fall through
            default:                // not a range.
                begin = iprev;      // backup to hyphen token
                chset.set_char(ch_prev, rxtraits, icase);
                chset.set_char(*begin++, rxtraits, icase);
                continue;
            }
        }

        if(have_prev)
        {
            chset.set_char(ch_prev, rxtraits, icase);
            have_prev = false;
        }

        switch(tok)
        {
        case token_charset_hyphen:
        case token_charset_invert:
        case token_charset_end:
        case token_posix_charset_end:
            begin = iprev; // un-get these tokens
            ch_prev = *begin++;
            have_prev = true;
            continue;

        case token_charset_backspace:
            ch_prev = char_type(8); // backspace
            have_prev = true;
            continue;

        case token_posix_charset_begin:
            {
                FwdIter tmp = begin, start = begin;
                bool invert = (token_charset_invert == traits.get_charset_token(tmp, end));
                if(invert)
                {
                    begin = start = tmp;
                }
                while(token_literal == (tok = traits.get_charset_token(begin, end)))
                {
                    tmp = ++begin;
                    ensure(begin != end, error_brack, "unexpected end of pattern found");
                }
                if(token_posix_charset_end == tok)
                {
                    char_class_type chclass = rxtraits.lookup_classname(start, tmp, icase);
                    ensure(0 != chclass, error_ctype, "unknown class name");
                    chset.set_class(chclass, invert);
                    continue;
                }
                begin = iprev; // un-get this token
                ch_prev = *begin++;
                have_prev = true;
            }
            continue;

        case token_escape:
            esc = parse_escape(begin, end, traits);
            if(escape_char == esc.type_)
            {
                ch_prev = esc.ch_;
                have_prev = true;
            }
            else if(escape_class == esc.type_)
            {
                char_class_type upper_ = lookup_classname(rxtraits, "upper");
                BOOST_ASSERT(0 != upper_);
                chset.set_class(esc.class_, rxtraits.isctype(*begin++, upper_));
            }
            else
            {
                BOOST_ASSERT(false);
            }
            continue;

        default:
            ch_prev = *begin++;
            have_prev = true;
            continue;
        }
    }
    while(ensure((iprev = begin) != end, error_brack, "unexpected end of pattern found"),
          token_charset_end != (tok = traits.get_charset_token(begin, end)));

    if(have_prev)
    {
        chset.set_char(ch_prev, rxtraits, icase);
    }

    if(invert)
    {
        chset.inverse();
    }
}

}}} // namespace boost::xpressive::detail

#endif
Boost C++ Libraries

boost/xpressive/detail/dynamic/parse_charset.hpp