Boost C++ Libraries

...one of the most highly regarded and expertly designed C++ library projects in the world. Herb Sutter and Andrei Alexandrescu, C++ Coding Standards

This is the documentation for an old version of Boost. Click here to view this page for the latest version.

boost/spirit/home/lex/lexer/lexertl/lexer.hpp

//  Copyright (c) 2001-2011 Hartmut Kaiser
//
//  Distributed under the Boost Software License, Version 1.0. (See accompanying
//  file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)

#if !defined(BOOST_SPIRIT_LEX_LEXER_MAR_17_2007_0139PM)
#define BOOST_SPIRIT_LEX_LEXER_MAR_17_2007_0139PM

#if defined(_MSC_VER)
#pragma once
#endif

#include <iosfwd>

#include <boost/spirit/home/support/detail/lexer/generator.hpp>
#include <boost/spirit/home/support/detail/lexer/rules.hpp>
#include <boost/spirit/home/support/detail/lexer/consts.hpp>
#include <boost/spirit/home/support/unused.hpp>

#include <boost/spirit/home/lex/lexer/lexertl/token.hpp>
#include <boost/spirit/home/lex/lexer/lexertl/functor.hpp>
#include <boost/spirit/home/lex/lexer/lexertl/functor_data.hpp>
#include <boost/spirit/home/lex/lexer/lexertl/iterator.hpp>
#if defined(BOOST_SPIRIT_LEXERTL_DEBUG)
#include <boost/spirit/home/support/detail/lexer/debug.hpp>
#endif

#include <boost/foreach.hpp>

namespace boost { namespace spirit { namespace lex { namespace lexertl
{
    ///////////////////////////////////////////////////////////////////////////
    namespace detail
    {
        ///////////////////////////////////////////////////////////////////////
        //  The must_escape function checks if the given character value needs
        //  to be preceded by a backslash character to disable its special
        //  meaning in the context of a regular expression
        ///////////////////////////////////////////////////////////////////////
        template <typename Char>
        inline bool must_escape(Char c)
        {
            // FIXME: more needed?
            // Characters carrying a special meaning inside a regular
            // expression; any of them has to be prefixed with a backslash
            // to be matched literally.
            static char const special[] = {
                '+', '/', '*', '?',
                '|',
                '(', ')',
                '[', ']',
                '{', '}',
                '.',
                '^', '$',
                '\\',
                '"'
            };

            char const* const last = special + sizeof(special);
            for (char const* p = special; p != last; ++p)
            {
                if (c == *p)
                    return true;
            }
            return false;
        }

        ///////////////////////////////////////////////////////////////////////
        //  The escape function returns the string representation of the given
        //  character value, possibly escaped with a backslash character, to
        //  allow it being safely used in a regular expression definition.
        ///////////////////////////////////////////////////////////////////////
        template <typename Char>
        inline std::basic_string<Char> escape(Char ch)
        {
            typedef std::basic_string<Char> string_type;

            // ordinary characters are returned as a one character string
            if (!detail::must_escape(ch))
                return string_type(1, ch);

            // special characters are returned with a leading backslash
            string_type escaped(1, static_cast<Char>('\\'));
            escaped.push_back(ch);
            return escaped;
        }

        ///////////////////////////////////////////////////////////////////////
        //  The map_flags function converts a combination of match_flags
        //  values into the corresponding combination of boost::lexer
        //  regex_flags. Bits other than match_not_dot_newline and match_icase
        //  are ignored.
        ///////////////////////////////////////////////////////////////////////
        inline boost::lexer::regex_flags map_flags(unsigned int flags)
        {
            bool const no_dot_newline =
                (flags & match_flags::match_not_dot_newline) != 0;
            bool const ignore_case =
                (flags & match_flags::match_icase) != 0;

            // accumulate as a plain integer, convert to the enum at the end
            unsigned int mapped = boost::lexer::none;
            if (no_dot_newline)
                mapped |= boost::lexer::dot_not_newline;
            if (ignore_case)
                mapped |= boost::lexer::icase;

            return static_cast<boost::lexer::regex_flags>(mapped);
        }
    }

    ///////////////////////////////////////////////////////////////////////////
    //  Forward declaration only; no definition is provided in this header.
    //  The lexer class template below names this function template in a
    //  friend declaration, which gives it direct access to the private
    //  members of the lexer.
    template <typename Lexer, typename F>
    bool generate_static(Lexer const&
      , std::basic_ostream<typename Lexer::char_type>&
      , typename Lexer::char_type const*, F);

    ///////////////////////////////////////////////////////////////////////////
    //
    //  Every lexer type to be used as a lexer for Spirit has to conform to
    //  the following public interface:
    //
    //    typedefs:
    //        iterator_type   The type of the iterator exposed by this lexer.
    //        token_type      The type of the tokens returned from the exposed
    //                        iterators.
    //
    //    functions:
    //        default constructor
    //                        Since lexers are instantiated as base classes
    //                        only it might be a good idea to make this
    //                        constructor protected.
    //        begin, end      Return a pair of iterators, when dereferenced
    //                        returning the sequence of tokens recognized in
    //                        the input stream given as the parameters to the
    //                        begin() function.
    //        add_token       Should add the definition of a token to be
    //                        recognized by this lexer.
    //        clear           Should delete all current token definitions
    //                        associated with the given state of this lexer
    //                        object.
    //
    //    template parameters:
    //        Iterator        The type of the iterator used to access the
    //                        underlying character stream.
    //        Token           The type of the tokens to be returned from the
    //                        exposed token iterator.
    //        Functor         The type of the InputPolicy to use to instantiate
    //                        the multi_pass iterator type to be used as the
    //                        token iterator (returned from begin()/end()).
    //
    ///////////////////////////////////////////////////////////////////////////

    ///////////////////////////////////////////////////////////////////////////
    //
    //  The lexer class is an implementation of a Spirit.Lex lexer on
    //  top of Ben Hanson's lexertl library as outlined above (For more
    //  information about lexertl go here: http://www.benhanson.net/lexertl.html).
    //
    //  This class is supposed to be used as the first and only template
    //  parameter while instantiating instances of a lex::lexer class.
    //
    ///////////////////////////////////////////////////////////////////////////
    template <typename Token = token<>
      , typename Iterator = typename Token::iterator_type
      , typename Functor = functor<Token, lexertl::detail::data, Iterator> >
    class lexer
    {
    private:
        struct dummy { void true_() {} };
        typedef void (dummy::*safe_bool)();

        static std::size_t const all_states_id = static_cast<std::size_t>(-2);

    public:
        operator safe_bool() const
            { return initialized_dfa_ ? &dummy::true_ : 0; }

        typedef typename boost::detail::iterator_traits<Iterator>::value_type
            char_type;
        typedef std::basic_string<char_type> string_type;

        typedef boost::lexer::basic_rules<char_type> basic_rules_type;

        //  Every lexer type to be used as a lexer for Spirit has to conform to
        //  a public interface .
        typedef Token token_type;
        typedef typename Token::id_type id_type;
        typedef iterator<Functor> iterator_type;

    private:
        // this type is purely used for the iterator_type construction below
        struct iterator_data_type
        {
            typedef typename Functor::semantic_actions_type semantic_actions_type;

            iterator_data_type(
                    boost::lexer::basic_state_machine<char_type> const& sm
                  , boost::lexer::basic_rules<char_type> const& rules
                  , semantic_actions_type const& actions)
              : state_machine_(sm), rules_(rules), actions_(actions)
            {}

            boost::lexer::basic_state_machine<char_type> const& state_machine_;
            boost::lexer::basic_rules<char_type> const& rules_;
            semantic_actions_type const& actions_;

        private:
            // silence MSVC warning C4512: assignment operator could not be generated
            iterator_data_type& operator= (iterator_data_type const&);
        };

    public:
        //  Return the start iterator usable for iterating over the generated
        //  tokens.
        iterator_type begin(Iterator& first, Iterator const& last
          , char_type const* initial_state = 0) const
        {
            if (!init_dfa())    // never minimize DFA for dynamic lexers
                return iterator_type();

            iterator_data_type iterator_data(state_machine_, rules_, actions_);
            return iterator_type(iterator_data, first, last, initial_state);
        }

        //  Return the end iterator usable to stop iterating over the generated
        //  tokens.
        iterator_type end() const
        {
            return iterator_type();
        }

    protected:
        //  Lexer instances can be created by means of a derived class only.
        lexer(unsigned int flags)
          : flags_(detail::map_flags(flags))
          , rules_(flags_)
          , initialized_dfa_(false)
        {}

    public:
        // interface for token definition management
        std::size_t add_token(char_type const* state, char_type tokendef,
            std::size_t token_id, char_type const* targetstate)
        {
            add_state(state);
            initialized_dfa_ = false;
            if (state == all_states())
                return rules_.add(state, detail::escape(tokendef), token_id, rules_.dot());

            if (0 == targetstate)
                targetstate = state;
            else
                add_state(targetstate);
            return rules_.add(state, detail::escape(tokendef), token_id, targetstate);
        }
        std::size_t add_token(char_type const* state, string_type const& tokendef,
            std::size_t token_id, char_type const* targetstate)
        {
            add_state(state);
            initialized_dfa_ = false;
            if (state == all_states())
                return rules_.add(state, tokendef, token_id, rules_.dot());

            if (0 == targetstate)
                targetstate = state;
            else
                add_state(targetstate);
            return rules_.add(state, tokendef, token_id, targetstate);
        }

        // interface for pattern definition management
        void add_pattern (char_type const* state, string_type const& name,
            string_type const& patterndef)
        {
            add_state(state);
            rules_.add_macro(name.c_str(), patterndef);
            initialized_dfa_ = false;
        }

        boost::lexer::rules const& get_rules() const { return rules_; }

        void clear(char_type const* state)
        {
            std::size_t s = rules_.state(state);
            if (boost::lexer::npos != s)
                rules_.clear(state);
            initialized_dfa_ = false;
        }
        std::size_t add_state(char_type const* state)
        {
            if (state == all_states())
                return all_states_id;

            std::size_t stateid = rules_.state(state);
            if (boost::lexer::npos == stateid) {
                stateid = rules_.add_state(state);
                initialized_dfa_ = false;
            }
            return stateid;
        }
        string_type initial_state() const
        {
            return string_type(rules_.initial());
        }
        string_type all_states() const
        {
            return string_type(rules_.all_states());
        }

        //  Register a semantic action with the given id
        template <typename F>
        void add_action(std::size_t unique_id, std::size_t state, F act)
        {
            // If you see an error here stating add_action is not a member of
            // fusion::unused_type then you are probably having semantic actions
            // attached to at least one token in the lexer definition without
            // using the lex::lexertl::actor_lexer<> as its base class.
            typedef typename Functor::wrap_action_type wrapper_type;
            if (state == all_states_id) {
                // add the action to all known states
                typedef typename
                    basic_rules_type::string_size_t_map::value_type
                state_type;

                std::size_t states = rules_.statemap().size();
                BOOST_FOREACH(state_type const& s, rules_.statemap()) {
                    for (std::size_t j = 0; j < states; ++j)
                        actions_.add_action(unique_id + j, s.second, wrapper_type::call(act));
                }
            }
            else {
                actions_.add_action(unique_id, state, wrapper_type::call(act));
            }
        }
//         template <typename F>
//         void add_action(std::size_t unique_id, char_type const* state, F act)
//         {
//             typedef typename Functor::wrap_action_type wrapper_type;
//             actions_.add_action(unique_id, add_state(state), wrapper_type::call(act));
//         }

        // We do not minimize the state machine by default anymore because
        // Ben said: "If you can afford to generate a lexer at runtime, there
        //            is little point in calling minimise."
        // Go figure.
        bool init_dfa(bool minimize = false) const
        {
            if (!initialized_dfa_) {
                state_machine_.clear();
                typedef boost::lexer::basic_generator<char_type> generator;
                generator::build (rules_, state_machine_);
                if (minimize)
                    generator::minimise (state_machine_);

#if defined(BOOST_SPIRIT_LEXERTL_DEBUG)
                boost::lexer::debug::dump(state_machine_, std::cerr);
#endif
                initialized_dfa_ = true;

//                 // release memory held by rules description
//                 basic_rules_type rules;
//                 rules.init_state_info(rules_);        // preserve states
//                 std::swap(rules, rules_);
            }
            return true;
        }

    private:
        // lexertl specific data
        mutable boost::lexer::basic_state_machine<char_type> state_machine_;
        boost::lexer::regex_flags flags_;
        /*mutable*/ basic_rules_type rules_;

        typename Functor::semantic_actions_type actions_;
        mutable bool initialized_dfa_;

        // generator functions must be able to access members directly
        template <typename Lexer, typename F>
        friend bool generate_static(Lexer const&
          , std::basic_ostream<typename Lexer::char_type>&
          , typename Lexer::char_type const*, F);
    };

    ///////////////////////////////////////////////////////////////////////////
    //
    //  The actor_lexer class is another implementation of a Spirit.Lex
    //  lexer on top of Ben Hanson's lexertl library as outlined above (For
    //  more information about lexertl go here:
    //  http://www.benhanson.net/lexertl.html).
    //
    //  The only difference to the lexer class above is that token_def
    //  definitions may have semantic (lexer) actions attached while being
    //  defined:
    //
    //      int w;
    //      token_def word = "[^ \t\n]+";
    //      self = word[++ref(w)];        // see example: word_count_lexer
    //
    //  This class is supposed to be used as the first and only template
    //  parameter while instantiating instances of a lex::lexer class.
    //
    ///////////////////////////////////////////////////////////////////////////
    template <typename Token = token<>
      , typename Iterator = typename Token::iterator_type
      , typename Functor = functor<Token, lexertl::detail::data, Iterator, mpl::true_> >
    class actor_lexer : public lexer<Token, Iterator, Functor>
    {
    protected:
        //  Lexer instances can be created by means of a derived class only.
        //  The flags are forwarded unchanged to the lexer base class; this
        //  class adds no members of its own and differs from lexer only in
        //  its default Functor argument.
        actor_lexer(unsigned int flags)
          : lexer<Token, Iterator, Functor>(flags) {}
    };

}}}}

#endif