Boost.Locale
generic_codecvt.hpp
1 //
2 // Copyright (c) 2015 Artyom Beilis (Tonkikh)
3 //
4 // Distributed under the Boost Software License, Version 1.0. (See
5 // accompanying file LICENSE_1_0.txt or copy at
6 // http://www.boost.org/LICENSE_1_0.txt)
7 //
8 #ifndef BOOST_LOCALE_GENERIC_CODECVT_HPP
9 #define BOOST_LOCALE_GENERIC_CODECVT_HPP
10 
11 #include <boost/locale/utf.hpp>
12 #include <boost/cstdint.hpp>
13 #include <boost/static_assert.hpp>
14 #include <locale>
15 
16 namespace boost {
17 namespace locale {
18 
19 #ifndef BOOST_LOCALE_DOXYGEN
20 //
21 // Compile-time check that std::mbstate_t is at least 2 bytes wide, so that a
// 16-bit value (surrogate state / pending UTF-16 unit) can be stored in it
22 //
23 BOOST_STATIC_ASSERT(sizeof(std::mbstate_t)>=2);
24 #endif
25 
26 #if defined(_MSC_VER) && _MSC_VER < 1700
27 // On old MSVC do_length is non-standard: it counts wide characters instead of narrow ones and does not change mbstate
// (the do_length overrides below take the state by const reference when this macro is defined).
// NOTE(review): _MSC_VER == 1700 is MSVC 11 (2012) itself, so this test only selects
// compilers older than MSVC 11 - confirm whether MSVC 11 was meant to be included.
28 #define BOOST_LOCALE_DO_LENGTH_MBSTATE_CONST
29 #endif
30 
35 public:
42  };
43 };
44 
139 template<typename CharType,typename CodecvtImpl,int CharSize=sizeof(CharType)>
141 
/// \brief generic_codecvt specialization for 2-byte character types.
///
/// Derives from std::codecvt<CharType,char,std::mbstate_t> and forwards the conversion of
/// individual code points to CodecvtImpl, which is reached through implementation().
/// Code points above 0xFFFF are exchanged with the wide side as two 16-bit units
/// (a surrogate pair); the first 16 bits of the caller's std::mbstate_t are used to
/// remember the surrogate state between calls.
///
/// The code below uses the following members of CodecvtImpl: state_type, initial_state(),
/// to_unicode(), from_unicode() and max_encoding_length().
150 template<typename CharType,typename CodecvtImpl>
151 class generic_codecvt<CharType,CodecvtImpl,2> : public std::codecvt<CharType,char,std::mbstate_t>, public generic_codecvt_base
152 {
153 public:
154 
155  typedef CharType uchar;
156 
    /// Constructs the facet, forwarding \a refs to the std::codecvt base class.
157  generic_codecvt(size_t refs = 0) :
158  std::codecvt<CharType,char,std::mbstate_t>(refs)
159  {
160  }
    /// Returns this object viewed as CodecvtImpl (static downcast of this;
    /// presumably CodecvtImpl is expected to derive from this class - confirm with users).
161  CodecvtImpl const &implementation() const
162  {
163  return *static_cast<CodecvtImpl const *>(this);
164  }
165 
166 protected:
167 
168 
    /// Never writes any output. Returns error, without touching \a next, when the
    /// 16-bit state stored in \a s is non-zero; otherwise sets \a next to \a from
    /// and returns ok.
169  virtual std::codecvt_base::result do_unshift(std::mbstate_t &s,char *from,char * /*to*/,char *&next) const
170  {
    // The first 16 bits of the mbstate_t are reinterpreted as the surrogate state
171  boost::uint16_t &state = *reinterpret_cast<boost::uint16_t *>(&s);
172 #ifdef DEBUG_CODECVT
173  std::cout << "Entering unshift " << std::hex << state << std::dec << std::endl;
174 #endif
175  if(state != 0)
176  return std::codecvt_base::error;
177  next=from;
178  return std::codecvt_base::ok;
179  }
    /// Always returns 0 (for std::codecvt this denotes a variable-width external encoding).
180  virtual int do_encoding() const throw()
181  {
182  return 0;
183  }
    /// Returns whatever the implementation's max_encoding_length() reports.
184  virtual int do_max_length() const throw()
185  {
186  return implementation().max_encoding_length();
187  }
    /// Always returns false: the facet never declares itself a no-op conversion.
188  virtual bool do_always_noconv() const throw()
189  {
190  return false;
191  }
192 
    /// Scans narrow input [from, from_end) for at most \a max wide characters.
    ///
    /// Standard variant: the surrogate flag inside \a std_state is updated in place and
    /// the number of narrow chars consumed (from - save_from) is returned.
    /// BOOST_LOCALE_DO_LENGTH_MBSTATE_CONST variant: \a std_state is const, only a local
    /// copy of the flag is changed, and the number of wide characters counted
    /// (save_max - max) is returned.
193  virtual int
194  do_length( std::mbstate_t
195  #ifdef BOOST_LOCALE_DO_LENGTH_MBSTATE_CONST
196  const
197  #endif
198  &std_state,
199  char const *from,
200  char const *from_end,
201  size_t max) const
202  {
203  #ifndef BOOST_LOCALE_DO_LENGTH_MBSTATE_CONST
204  char const *save_from = from;
205  boost::uint16_t &state = *reinterpret_cast<boost::uint16_t *>(&std_state);
206  #else
207  size_t save_max = max;
208  boost::uint16_t state = *reinterpret_cast<boost::uint16_t const *>(&std_state);
209  #endif
210 
211  typedef typename CodecvtImpl::state_type state_type;
212  state_type cvt_state = implementation().initial_state(generic_codecvt_base::to_unicode_state);
213  while(max > 0 && from < from_end){
    // Remember where this code point starts so the position can be restored
214  char const *prev_from = from;
215  boost::uint32_t ch=implementation().to_unicode(cvt_state,from,from_end);
    // NOTE(review): the line opening the block below is absent from this listing
    // (numbering skips 216); presumably it tests ch for a failed conversion - confirm
    // against the original header. Inside the block the input position is rolled
    // back and scanning stops.
217  from = prev_from;
218  break;
219  }
    // One wide character is accounted for on every successful iteration
220  max --;
221  if(ch > 0xFFFF) {
    // A code point above 0xFFFF takes two wide characters: on the first pass the
    // input is not consumed and the flag is set, on the second pass the input
    // stays consumed and the flag is cleared.
222  if(state == 0) {
223  from = prev_from;
224  state = 1;
225  }
226  else {
227  state = 0;
228  }
229  }
230  }
231  #ifndef BOOST_LOCALE_DO_LENGTH_MBSTATE_CONST
232  return from - save_from;
233  #else
234  return save_max - max;
235  #endif
236  }
237 
238 
    /// Converts narrow input [from, from_end) into 2-byte characters stored in [to, to_end).
    ///
    /// \a from_next and \a to_next receive the positions reached in the input and in the
    /// output. After the loop, a result that is still ok is turned into partial when
    /// input is left over or the surrogate flag is still set. The error / partial
    /// assignments inside the loop belong to blocks whose opening lines are missing
    /// from this listing (see the notes below).
239  virtual std::codecvt_base::result
240  do_in( std::mbstate_t &std_state,
241  char const *from,
242  char const *from_end,
243  char const *&from_next,
244  uchar *to,
245  uchar *to_end,
246  uchar *&to_next) const
247  {
248  std::codecvt_base::result r=std::codecvt_base::ok;
249 
250  // mbstate_t is a POD type and should be initialized to 0 (i.e. state = stateT())
251  // according to the standard. We use it to keep a 0/1 flag for surrogate pair writing:
252  //
253  // if 0, no code point above 0xFFFF is pending; if 1, a code point above 0xFFFF was observed
254  // and the first surrogate has been written, but no input has been consumed
255  boost::uint16_t &state = *reinterpret_cast<boost::uint16_t *>(&std_state);
256  typedef typename CodecvtImpl::state_type state_type;
257  state_type cvt_state = implementation().initial_state(generic_codecvt_base::to_unicode_state);
258  while(to < to_end && from < from_end)
259  {
260 #ifdef DEBUG_CODECVT
261  std::cout << "Entering IN--------------" << std::endl;
262  std::cout << "State " << std::hex << state <<std::endl;
263  std::cout << "Left in " << std::dec << from_end - from << " out " << to_end -to << std::endl;
264 #endif
    // Start of the current code point, used to undo the read when needed
265  char const *from_saved = from;
266 
267  uint32_t ch=implementation().to_unicode(cvt_state,from,from_end);
268 
    // NOTE(review): the line opening the block below is absent from this listing
    // (numbering skips 269); presumably a test of ch against utf::illegal - confirm.
270  from = from_saved;
271  r=std::codecvt_base::error;
272  break;
273  }
    // NOTE(review): the line opening the block below is absent from this listing
    // (numbering skips 274); presumably a test of ch against utf::incomplete - confirm.
275  from = from_saved;
276  r=std::codecvt_base::partial;
277  break;
278  }
279  // Code points up to 0xFFFF are written to the output directly
280  if(ch <= 0xFFFF) {
281  *to++=ch;
282  }
283  else {
284  // for other code points we do the following
285  //
286  // 1. We can't consume our input yet, as we may find ourselves
287  // in a state where all input is consumed but not all output is written, i.e. only
288  // the first surrogate is written
289  // 2. We only write the first surrogate and mark this in the state; we also revert
290  // the from pointer in order to make sure this code point is read
291  // once again, and then we consume our input together with writing
292  // the second surrogate
293  ch-=0x10000;
294  boost::uint16_t vh = ch >> 10;
295  boost::uint16_t vl = ch & 0x3FF;
296  boost::uint16_t w1 = vh + 0xD800;
297  boost::uint16_t w2 = vl + 0xDC00;
298  if(state == 0) {
299  from = from_saved;
300  *to++ = w1;
301  state = 1;
302  }
303  else {
304  *to++ = w2;
305  state = 0;
306  }
307  }
308  }
309  from_next=from;
310  to_next=to;
    // Unconverted input or a half-written surrogate pair means the job is not finished
311  if(r == std::codecvt_base::ok && (from!=from_end || state!=0))
312  r = std::codecvt_base::partial;
313 #ifdef DEBUG_CODECVT
314  std::cout << "Returning ";
315  switch(r) {
316  case std::codecvt_base::ok:
317  std::cout << "ok" << std::endl;
318  break;
319  case std::codecvt_base::partial:
320  std::cout << "partial" << std::endl;
321  break;
322  case std::codecvt_base::error:
323  std::cout << "error" << std::endl;
324  break;
325  default:
326  std::cout << "other" << std::endl;
327  break;
328  }
329  std::cout << "State " << std::hex << state <<std::endl;
330  std::cout << "Left in " << std::dec << from_end - from << " out " << to_end -to << std::endl;
331 #endif
332  return r;
333  }
334 
    /// Converts 2-byte characters [from, from_end) into narrow output stored in [to, to_end).
    ///
    /// \a from_next and \a to_next receive the positions reached in the input and in the
    /// output. Returns error for a second surrogate without a preceding first one, for a
    /// first surrogate not followed by a second one, and when from_unicode() reports
    /// utf::illegal; returns partial when from_unicode() reports utf::incomplete or when
    /// the loop ends with input left over; otherwise ok.
335  virtual std::codecvt_base::result
336  do_out( std::mbstate_t &std_state,
337  uchar const *from,
338  uchar const *from_end,
339  uchar const *&from_next,
340  char *to,
341  char *to_end,
342  char *&to_next) const
343  {
344  std::codecvt_base::result r=std::codecvt_base::ok;
345  // mbstate_t is a POD type and should be initialized to 0 (i.e. state = stateT())
346  // according to the standard. We assume that sizeof(mbstate_t) >=2 in order
347  // to be able to store the first observed surrogate
348  //
349  // State: state!=0 - a first surrogate was observed (state = first surrogate),
350  // we expect the second one to come and then zero the state
352  boost::uint16_t &state = *reinterpret_cast<boost::uint16_t *>(&std_state);
353  typedef typename CodecvtImpl::state_type state_type;
354  state_type cvt_state = implementation().initial_state(generic_codecvt_base::from_unicode_state);
355  while(to < to_end && from < from_end)
356  {
357 #ifdef DEBUG_CODECVT
358  std::cout << "Entering OUT --------------" << std::endl;
359  std::cout << "State " << std::hex << state <<std::endl;
360  std::cout << "Left in " << std::dec << from_end - from << " out " << to_end -to << std::endl;
361 #endif
362  boost::uint32_t ch=0;
363  if(state != 0) {
364  // if the state indicates that the first surrogate was already seen
365  // we should make sure that the unit that comes next is actually
366  // a second surrogate
367  boost::uint16_t w1 = state;
368  boost::uint16_t w2 = *from;
369  // we don't advance from here, as writing may still end in an incomplete or
370  // partial conversion
371  if(0xDC00 <= w2 && w2<=0xDFFF) {
372  boost::uint16_t vh = w1 - 0xD800;
373  boost::uint16_t vl = w2 - 0xDC00;
374  ch=((uint32_t(vh) << 10) | vl) + 0x10000;
375  }
376  else {
377  // Invalid surrogate
378  r=std::codecvt_base::error;
379  break;
380  }
381  }
382  else {
383  ch = *from;
384  if(0xD800 <= ch && ch<=0xDBFF) {
385  // if this is a first surrogate we put
386  // it into the state and consume it; nothing is converted
387  // for it on its own, so we advance
388  // the from pointer manually and go on to the next unit
389  state = ch;
390  from++;
391  continue;
392  }
393  else if(0xDC00 <= ch && ch<=0xDFFF) {
394  // if we observe a second surrogate while
395  // only a first one may be expected, we break from the loop with an error
396  // as it is illegal input
397  r=std::codecvt_base::error;
398  break;
399  }
400  }
    // NOTE(review): the line opening the block below is absent from this listing
    // (numbering skips 401); presumably a validity check of ch - confirm.
402  r=std::codecvt_base::error;
403  break;
404  }
405  boost::uint32_t len = implementation().from_unicode(cvt_state,ch,to,to_end);
406  if(len == boost::locale::utf::incomplete) {
407  r=std::codecvt_base::partial;
408  break;
409  }
410  else if(len == boost::locale::utf::illegal) {
411  r=std::codecvt_base::error;
412  break;
413  }
414  else
415  to+= len;
    // The code point was written completely: clear any stored surrogate and consume the unit
416  state = 0;
417  from++;
418  }
419  from_next=from;
420  to_next=to;
421  if(r==std::codecvt_base::ok && from!=from_end)
422  r = std::codecvt_base::partial;
423 #ifdef DEBUG_CODECVT
424  std::cout << "Returning ";
425  switch(r) {
426  case std::codecvt_base::ok:
427  std::cout << "ok" << std::endl;
428  break;
429  case std::codecvt_base::partial:
430  std::cout << "partial" << std::endl;
431  break;
432  case std::codecvt_base::error:
433  std::cout << "error" << std::endl;
434  break;
435  default:
436  std::cout << "other" << std::endl;
437  break;
438  }
439  std::cout << "State " << std::hex << state <<std::endl;
440  std::cout << "Left in " << std::dec << from_end - from << " out " << to_end -to << std::endl;
441 #endif
442  return r;
443  }
444 
445 };
446 
/// \brief generic_codecvt specialization for 4-byte character types.
///
/// Derives from std::codecvt<CharType,char,std::mbstate_t> and forwards the conversion of
/// individual code points to CodecvtImpl, which is reached through implementation().
/// Each code point is stored in a single wide character, and the std::mbstate_t
/// arguments are left unnamed and never read or written.
///
/// The code below uses the following members of CodecvtImpl: state_type, initial_state(),
/// to_unicode(), from_unicode() and max_encoding_length().
453 template<typename CharType,typename CodecvtImpl>
454 class generic_codecvt<CharType,CodecvtImpl,4> : public std::codecvt<CharType,char,std::mbstate_t>, public generic_codecvt_base
455 {
456 public:
457  typedef CharType uchar;
458 
    /// Constructs the facet, forwarding \a refs to the std::codecvt base class.
459  generic_codecvt(size_t refs = 0) :
460  std::codecvt<CharType,char,std::mbstate_t>(refs)
461  {
462  }
463 
    /// Returns this object viewed as CodecvtImpl (static downcast of this;
    /// presumably CodecvtImpl is expected to derive from this class - confirm with users).
464  CodecvtImpl const &implementation() const
465  {
466  return *static_cast<CodecvtImpl const *>(this);
467  }
468 
469 protected:
470 
    /// Never writes any output: sets \a next to \a from and returns ok.
471  virtual std::codecvt_base::result do_unshift(std::mbstate_t &/*s*/,char *from,char * /*to*/,char *&next) const
472  {
473  next=from;
474  return std::codecvt_base::ok;
475  }
    /// Always returns 0 (for std::codecvt this denotes a variable-width external encoding).
476  virtual int do_encoding() const throw()
477  {
478  return 0;
479  }
    /// Returns whatever the implementation's max_encoding_length() reports.
480  virtual int do_max_length() const throw()
481  {
482  return implementation().max_encoding_length();
483  }
    /// Always returns false: the facet never declares itself a no-op conversion.
484  virtual bool do_always_noconv() const throw()
485  {
486  return false;
487  }
488 
    /// Scans narrow input [from, from_end) for at most \a max wide characters.
    ///
    /// Standard variant: returns the number of narrow chars consumed (from - start_from).
    /// BOOST_LOCALE_DO_LENGTH_MBSTATE_CONST variant: returns the number of wide
    /// characters counted (save_max - max).
489  virtual int
490  do_length( std::mbstate_t
491  #ifdef BOOST_LOCALE_DO_LENGTH_MBSTATE_CONST
492  const
493  #endif
494  &/*state*/,
495  char const *from,
496  char const *from_end,
497  size_t max) const
498  {
499  #ifndef BOOST_LOCALE_DO_LENGTH_MBSTATE_CONST
500  char const *start_from = from;
501  #else
502  size_t save_max = max;
503  #endif
504  typedef typename CodecvtImpl::state_type state_type;
505  state_type cvt_state = implementation().initial_state(generic_codecvt_base::to_unicode_state);
506  while(max > 0 && from < from_end){
    // Remember where this code point starts so the position can be restored
507  char const *save_from = from;
508  boost::uint32_t ch=implementation().to_unicode(cvt_state,from,from_end);
    // NOTE(review): the line opening the block below is absent from this listing
    // (numbering skips 509); presumably it tests ch for a failed conversion - confirm
    // against the original header. Inside the block the input position is rolled
    // back and scanning stops.
510  from = save_from;
511  break;
512  }
513  max--;
514  }
515  #ifndef BOOST_LOCALE_DO_LENGTH_MBSTATE_CONST
516  return from - start_from;
517  #else
518  return save_max - max;
519  #endif
520  }
521 
522 
    /// Converts narrow input [from, from_end) into 4-byte characters stored in [to, to_end).
    ///
    /// \a from_next and \a to_next receive the positions reached in the input and in the
    /// output. After the loop, a result that is still ok is turned into partial when
    /// input is left over. The error / partial assignments inside the loop belong to
    /// blocks whose opening lines are missing from this listing (see the notes below).
523  virtual std::codecvt_base::result
524  do_in( std::mbstate_t &/*state*/,
525  char const *from,
526  char const *from_end,
527  char const *&from_next,
528  uchar *to,
529  uchar *to_end,
530  uchar *&to_next) const
531  {
532  std::codecvt_base::result r=std::codecvt_base::ok;
533 
534  // The mbstate_t argument is unnamed and never used in this specialization:
535  // every code point returned by to_unicode() is stored in one wide character,
536  //
537  // so, unlike the 2-byte specialization, there is no surrogate flag to carry
538  // from one call to the next
539  typedef typename CodecvtImpl::state_type state_type;
540  state_type cvt_state = implementation().initial_state(generic_codecvt_base::to_unicode_state);
541  while(to < to_end && from < from_end)
542  {
    // NOTE(review): `state` is not declared anywhere in this function (the mbstate_t
    // parameter is unnamed), so this debug block does not look compilable when
    // DEBUG_CODECVT is defined - confirm and drop the "State" line if so.
543 #ifdef DEBUG_CODECVT
544  std::cout << "Entering IN--------------" << std::endl;
545  std::cout << "State " << std::hex << state <<std::endl;
546  std::cout << "Left in " << std::dec << from_end - from << " out " << to_end -to << std::endl;
547 #endif
    // Start of the current code point, used to undo the read when needed
548  char const *from_saved = from;
549 
550  uint32_t ch=implementation().to_unicode(cvt_state,from,from_end);
551 
    // NOTE(review): the line opening the block below is absent from this listing
    // (numbering skips 552); presumably a test of ch against utf::illegal - confirm.
553  r=std::codecvt_base::error;
554  from = from_saved;
555  break;
556  }
    // NOTE(review): the line opening the block below is absent from this listing
    // (numbering skips 557); presumably a test of ch against utf::incomplete - confirm.
558  r=std::codecvt_base::partial;
559  from=from_saved;
560  break;
561  }
562  *to++=ch;
563  }
564  from_next=from;
565  to_next=to;
566  if(r == std::codecvt_base::ok && from!=from_end)
567  r = std::codecvt_base::partial;
    // NOTE(review): same undeclared `state` as in the debug block at the top of the loop.
568 #ifdef DEBUG_CODECVT
569  std::cout << "Returning ";
570  switch(r) {
571  case std::codecvt_base::ok:
572  std::cout << "ok" << std::endl;
573  break;
574  case std::codecvt_base::partial:
575  std::cout << "partial" << std::endl;
576  break;
577  case std::codecvt_base::error:
578  std::cout << "error" << std::endl;
579  break;
580  default:
581  std::cout << "other" << std::endl;
582  break;
583  }
584  std::cout << "State " << std::hex << state <<std::endl;
585  std::cout << "Left in " << std::dec << from_end - from << " out " << to_end -to << std::endl;
586 #endif
587  return r;
588  }
589 
    /// Converts 4-byte characters [from, from_end) into narrow output stored in [to, to_end).
    ///
    /// \a from_next and \a to_next receive the positions reached in the input and in the
    /// output. Returns partial when from_unicode() reports utf::incomplete or when the
    /// loop ends with input left over, error when from_unicode() reports utf::illegal
    /// (and in a block whose opening line is missing from this listing), otherwise ok.
590  virtual std::codecvt_base::result
591  do_out( std::mbstate_t &/*std_state*/,
592  uchar const *from,
593  uchar const *from_end,
594  uchar const *&from_next,
595  char *to,
596  char *to_end,
597  char *&to_next) const
598  {
599  std::codecvt_base::result r=std::codecvt_base::ok;
600  typedef typename CodecvtImpl::state_type state_type;
601  state_type cvt_state = implementation().initial_state(generic_codecvt_base::from_unicode_state);
602  while(to < to_end && from < from_end)
603  {
    // NOTE(review): `state` is not declared anywhere in this function (the mbstate_t
    // parameter is unnamed), so this debug block does not look compilable when
    // DEBUG_CODECVT is defined - confirm and drop the "State" line if so.
604 #ifdef DEBUG_CODECVT
605  std::cout << "Entering OUT --------------" << std::endl;
606  std::cout << "State " << std::hex << state <<std::endl;
607  std::cout << "Left in " << std::dec << from_end - from << " out " << to_end -to << std::endl;
608 #endif
609  boost::uint32_t ch=0;
610  ch = *from;
    // NOTE(review): the line opening the block below is absent from this listing
    // (numbering skips 611); presumably a validity check of ch - confirm.
612  r=std::codecvt_base::error;
613  break;
614  }
615  boost::uint32_t len = implementation().from_unicode(cvt_state,ch,to,to_end);
616  if(len == boost::locale::utf::incomplete) {
617  r=std::codecvt_base::partial;
618  break;
619  }
620  else if(len == boost::locale::utf::illegal) {
621  r=std::codecvt_base::error;
622  break;
623  }
    // len narrow chars were produced for this wide character: advance both cursors
624  to+=len;
625  from++;
626  }
627  from_next=from;
628  to_next=to;
629  if(r==std::codecvt_base::ok && from!=from_end)
630  r = std::codecvt_base::partial;
    // NOTE(review): same undeclared `state` as in the debug block at the top of the loop.
631 #ifdef DEBUG_CODECVT
632  std::cout << "Returning ";
633  switch(r) {
634  case std::codecvt_base::ok:
635  std::cout << "ok" << std::endl;
636  break;
637  case std::codecvt_base::partial:
638  std::cout << "partial" << std::endl;
639  break;
640  case std::codecvt_base::error:
641  std::cout << "error" << std::endl;
642  break;
643  default:
644  std::cout << "other" << std::endl;
645  break;
646  }
647  std::cout << "State " << std::hex << state <<std::endl;
648  std::cout << "Left in " << std::dec << from_end - from << " out " << to_end -to << std::endl;
649 #endif
650  return r;
651  }
652 };
653 
654 
/// \brief generic_codecvt specialization for 1-byte character types.
///
/// Only the implementation() accessor and the constructor are provided; no do_*
/// conversion function is overridden in this class, so the behaviour inherited from
/// the std::codecvt<CharType,char,std::mbstate_t> base applies.
655 template<typename CharType,typename CodecvtImpl>
656 class generic_codecvt<CharType,CodecvtImpl,1> : public std::codecvt<CharType,char,std::mbstate_t>, public generic_codecvt_base
657 {
658 public:
659  typedef CharType uchar;
660 
    /// Returns this object viewed as CodecvtImpl (static downcast of this).
661  CodecvtImpl const &implementation() const
662  {
663  return *static_cast<CodecvtImpl const *>(this);
664  }
665 
    /// Constructs the facet, forwarding \a refs to the std::codecvt base class.
    ///
    /// The base is named with CharType rather than a hard-coded char, so the
    /// initializer always designates the class this specialization really derives
    /// from (the same form the 2-byte and 4-byte specializations use). For
    /// CharType == char this is the very same type as before.
666  generic_codecvt(size_t refs = 0) : std::codecvt<CharType,char,std::mbstate_t>(refs)
667  {
668  }
669 };
670 
671 } // locale
672 } // namespace boost
673 
674 #endif
675 // vim: tabstop=4 expandtab shiftwidth=4 softtabstop=4
bool is_valid_codepoint(code_point v)
The function checks whether v is a valid code point.
Definition: utf.hpp:49
The state would be used by to_unicode functions.
Definition: generic_codecvt.hpp:40
static const code_point incomplete
Special constant that defines incomplete code point.
Definition: utf.hpp:44
initial_convertion_state
Definition: generic_codecvt.hpp:39
static const code_point illegal
Special constant that defines illegal code point.
Definition: utf.hpp:39
A base class that is used to define constants for generic_codecvt.
Definition: generic_codecvt.hpp:34
Generic codecvt facet for various stateless encodings to UTF-16 and UTF-32 using wchar_t...
Definition: generic_codecvt.hpp:140
The state would be used by from_unicode functions.
Definition: generic_codecvt.hpp:41