libstdc++
codecvt_specializations.h
Go to the documentation of this file.
1 // Locale support (codecvt) -*- C++ -*-
2 
3 // Copyright (C) 2000-2023 Free Software Foundation, Inc.
4 //
5 // This file is part of the GNU ISO C++ Library. This library is free
6 // software; you can redistribute it and/or modify it under the
7 // terms of the GNU General Public License as published by the
8 // Free Software Foundation; either version 3, or (at your option)
9 // any later version.
10 
11 // This library is distributed in the hope that it will be useful,
12 // but WITHOUT ANY WARRANTY; without even the implied warranty of
13 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 // GNU General Public License for more details.
15 
16 // Under Section 7 of GPL version 3, you are granted additional
17 // permissions described in the GCC Runtime Library Exception, version
18 // 3.1, as published by the Free Software Foundation.
19 
20 // You should have received a copy of the GNU General Public License and
21 // a copy of the GCC Runtime Library Exception along with this program;
22 // see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
23 // <http://www.gnu.org/licenses/>.
24 
25 //
26 // ISO C++ 14882: 22.2.1.5 Template class codecvt
27 //
28 
29 // Written by Benjamin Kosnik <bkoz@redhat.com>
30 
31 /** @file ext/codecvt_specializations.h
32  * This file is a GNU extension to the Standard C++ Library.
33  */
34 
35 #ifndef _EXT_CODECVT_SPECIALIZATIONS_H
36 #define _EXT_CODECVT_SPECIALIZATIONS_H 1
37 
38 #include <bits/requires_hosted.h> // GNU extensions are currently omitted
39 
40 #include <bits/c++config.h>
41 #include <locale>
42 #include <iconv.h>
43 
44 namespace __gnu_cxx _GLIBCXX_VISIBILITY(default)
45 {
46 _GLIBCXX_BEGIN_NAMESPACE_VERSION
47 _GLIBCXX_BEGIN_NAMESPACE_CXX11
48 
/// Extension to use iconv for dealing with character encodings.
// This includes conversions and comparisons between various character
// sets.  This object encapsulates data that may need to be shared between
// char_traits, codecvt and ctype.
//
// An encoding_state owns up to two iconv conversion descriptors: one
// converting external -> internal and one converting internal -> external.
// Both are opened by init() from the stored encoding names and closed by
// destroy().  Copying duplicates the encoding information and opens fresh
// descriptors; it does not carry over the conversion state of the source.
class encoding_state
{
public:
  // Types:
  // NB: A conversion descriptor subsumes and enhances the
  // functionality of a simple state type such as mbstate_t.
  typedef iconv_t	descriptor_type;

protected:
  // Name of internal character set encoding.
  std::string		_M_int_enc;

  // Name of external character set encoding.
  std::string		_M_ext_enc;

  // Conversion descriptor from external encoding to internal encoding.
  // Zero means "not opened".
  descriptor_type	_M_in_desc;

  // Conversion descriptor from internal encoding to external encoding.
  // Zero means "not opened".
  descriptor_type	_M_out_desc;

  // The byte-order marker for the external encoding, if necessary.
  int			_M_ext_bom;

  // The byte-order marker for the internal encoding, if necessary.
  int			_M_int_bom;

  // Number of external bytes needed to construct one complete
  // character in the internal encoding.
  // NB: -1 indicates variable, or stateful, encodings.
  int			_M_bytes;

public:
  // Creates a state with no encodings and no descriptors; good() is false.
  explicit
  encoding_state()
  : _M_in_desc(0), _M_out_desc(0), _M_ext_bom(0), _M_int_bom(0), _M_bytes(0)
  { }

  // Creates a state for the given internal/external encoding names and
  // opens both descriptors.  init() reports an iconv_open failure through
  // std::__throw_runtime_error.
  explicit
  encoding_state(const char* __int, const char* __ext,
		 int __ibom = 0, int __ebom = 0, int __bytes = 1)
  : _M_int_enc(__int), _M_ext_enc(__ext), _M_in_desc(0), _M_out_desc(0),
    _M_ext_bom(__ebom), _M_int_bom(__ibom), _M_bytes(__bytes)
  { init(); }

  // 21.1.2 traits typedefs
  // p4
  // typedef STATE_T state_type
  // requires: state_type shall meet the requirements of
  // CopyConstructible types (20.1.3)
  // NB: This does not preserve the actual state of the conversion
  // descriptor member, but it does duplicate the encoding
  // information.
  encoding_state(const encoding_state& __obj) : _M_in_desc(0), _M_out_desc(0)
  { construct(__obj); }

  // Need assignment operator as well.
  encoding_state&
  operator=(const encoding_state& __obj)
  {
    // Self-assignment must not close and reopen the descriptors: that
    // would silently discard any conversion state they hold.
    if (this != &__obj)
      construct(__obj);
    return *this;
  }

  ~encoding_state()
  { destroy(); }

  // True when both descriptors are open, i.e. neither is zero nor the
  // iconv error value (iconv_t)(-1).
  bool
  good() const throw()
  {
    const descriptor_type __err = (iconv_t)(-1);
    bool __test = _M_in_desc && _M_in_desc != __err;
    __test &= _M_out_desc && _M_out_desc != __err;
    return __test;
  }

  int
  character_ratio() const
  { return _M_bytes; }

  const std::string
  internal_encoding() const
  { return _M_int_enc; }

  int
  internal_bom() const
  { return _M_int_bom; }

  const std::string
  external_encoding() const
  { return _M_ext_enc; }

  int
  external_bom() const
  { return _M_ext_bom; }

  // Descriptor converting external -> internal.
  const descriptor_type&
  in_descriptor() const
  { return _M_in_desc; }

  // Descriptor converting internal -> external.
  const descriptor_type&
  out_descriptor() const
  { return _M_out_desc; }

protected:
  // Opens whichever descriptor is still zero, provided both encoding
  // names are non-empty.  On failure every descriptor held by this object
  // is released and reset to zero before the error is reported, so that
  // nothing leaks when this runs inside a constructor (where the
  // destructor will not be invoked) and so that a later construct() can
  // retry instead of being blocked by a stored error value.
  void
  init()
  {
    const descriptor_type __err = (iconv_t)(-1);
    const bool __have_encodings = _M_int_enc.size() && _M_ext_enc.size();
    if (!_M_in_desc && __have_encodings)
      {
	_M_in_desc = iconv_open(_M_int_enc.c_str(), _M_ext_enc.c_str());
	if (_M_in_desc == __err)
	  {
	    _M_in_desc = 0;
	    destroy();
	    std::__throw_runtime_error(__N("encoding_state::_M_init "
				"creating iconv input descriptor failed"));
	  }
      }
    if (!_M_out_desc && __have_encodings)
      {
	_M_out_desc = iconv_open(_M_ext_enc.c_str(), _M_int_enc.c_str());
	if (_M_out_desc == __err)
	  {
	    _M_out_desc = 0;
	    // Also closes the input descriptor opened just above.
	    destroy();
	    std::__throw_runtime_error(__N("encoding_state::_M_init "
				"creating iconv output descriptor failed"));
	  }
      }
  }

  // Replaces this object's encoding information with a copy of __obj's
  // and opens new descriptors for it.
  void
  construct(const encoding_state& __obj)
  {
    destroy();
    _M_int_enc = __obj._M_int_enc;
    _M_ext_enc = __obj._M_ext_enc;
    _M_ext_bom = __obj._M_ext_bom;
    _M_int_bom = __obj._M_int_bom;
    _M_bytes = __obj._M_bytes;
    init();
  }

  // Closes any open descriptor and leaves both members at zero.
  void
  destroy() throw()
  {
    const descriptor_type __err = (iconv_t)(-1);
    if (_M_in_desc && _M_in_desc != __err)
      iconv_close(_M_in_desc);
    _M_in_desc = 0;
    if (_M_out_desc && _M_out_desc != __err)
      iconv_close(_M_out_desc);
    _M_out_desc = 0;
  }
};
207 
208  /// encoding_char_traits
209  // Custom traits type with encoding_state for the state type, and the
210  // associated fpos<encoding_state> for the position type, all other
211  // bits equivalent to the required char_traits instantiations.
212  template<typename _CharT>
214  : public std::char_traits<_CharT>
215  {
216  typedef encoding_state state_type;
217  typedef typename std::fpos<state_type> pos_type;
218  };
219 
220 _GLIBCXX_END_NAMESPACE_CXX11
221 _GLIBCXX_END_NAMESPACE_VERSION
222 } // namespace
223 
224 
225 namespace std _GLIBCXX_VISIBILITY(default)
226 {
227 _GLIBCXX_BEGIN_NAMESPACE_VERSION
228 
230 
231  /// codecvt<InternT, _ExternT, encoding_state> specialization.
232  // This partial specialization takes advantage of iconv to provide
233  // code conversions between a large number of character encodings.
234  template<typename _InternT, typename _ExternT>
235  class codecvt<_InternT, _ExternT, encoding_state>
236  : public __codecvt_abstract_base<_InternT, _ExternT, encoding_state>
237  {
238  public:
239  // Types:
240  typedef codecvt_base::result result;
241  typedef _InternT intern_type;
242  typedef _ExternT extern_type;
244  typedef state_type::descriptor_type descriptor_type;
245 
246  // Data Members:
247  static locale::id id;
248 
249  explicit
250  codecvt(size_t __refs = 0)
252  { }
253 
254  explicit
255  codecvt(state_type& __enc, size_t __refs = 0)
257  { }
258 
259  protected:
260  virtual
261  ~codecvt() { }
262 
263  virtual result
264  do_out(state_type& __state, const intern_type* __from,
265  const intern_type* __from_end, const intern_type*& __from_next,
266  extern_type* __to, extern_type* __to_end,
267  extern_type*& __to_next) const;
268 
269  virtual result
270  do_unshift(state_type& __state, extern_type* __to,
271  extern_type* __to_end, extern_type*& __to_next) const;
272 
273  virtual result
274  do_in(state_type& __state, const extern_type* __from,
275  const extern_type* __from_end, const extern_type*& __from_next,
276  intern_type* __to, intern_type* __to_end,
277  intern_type*& __to_next) const;
278 
279  virtual int
280  do_encoding() const throw();
281 
282  virtual bool
283  do_always_noconv() const throw();
284 
285  virtual int
286  do_length(state_type&, const extern_type* __from,
287  const extern_type* __end, size_t __max) const;
288 
289  virtual int
290  do_max_length() const throw();
291  };
292 
293  template<typename _InternT, typename _ExternT>
294  locale::id
296 
// Bridges the two declarations of iconv() found in the wild: SUSv2 and
// others declare the second parameter as 'const char**', whereas glibc 2.2
// uses 'char**', which matches the POSIX 1003.1-2001 standard.
// _Tp is deduced from the type of the function actually passed in, and
// the input-buffer argument is cast to that type before the call, so the
// callers below can always hand over a plain 'char**'.
template<typename _Tp>
  inline size_t
  __iconv_adaptor(size_t(*__func)(iconv_t, _Tp, size_t*, char**, size_t*),
		  iconv_t __cd, char** __inbuf, size_t* __inbytes,
		  char** __outbuf, size_t* __outbytes)
  {
    // A C-style cast is used on purpose: depending on _Tp this is either
    // an identity conversion or one that adds const.
    _Tp __typed_inbuf = (_Tp)__inbuf;
    return __func(__cd, __typed_inbuf, __inbytes, __outbuf, __outbytes);
  }
307 
308  template<typename _InternT, typename _ExternT>
309  codecvt_base::result
311  do_out(state_type& __state, const intern_type* __from,
312  const intern_type* __from_end, const intern_type*& __from_next,
313  extern_type* __to, extern_type* __to_end,
314  extern_type*& __to_next) const
315  {
316  result __ret = codecvt_base::error;
317  if (__state.good())
318  {
319  const descriptor_type& __desc = __state.out_descriptor();
320  const size_t __fmultiple = sizeof(intern_type);
321  size_t __fbytes = __fmultiple * (__from_end - __from);
322  const size_t __tmultiple = sizeof(extern_type);
323  size_t __tbytes = __tmultiple * (__to_end - __to);
324 
325  // Argument list for iconv specifies a byte sequence. Thus,
326  // all to/from arrays must be brutally casted to char*.
327  char* __cto = reinterpret_cast<char*>(__to);
328  char* __cfrom;
329  size_t __conv;
330 
331  // Some encodings need a byte order marker as the first item
332  // in the byte stream, to designate endian-ness. The default
333  // value for the byte order marker is NULL, so if this is
334  // the case, it's not necessary and we can just go on our
335  // merry way.
336  int __int_bom = __state.internal_bom();
337  if (__int_bom)
338  {
339  size_t __size = __from_end - __from;
340  intern_type* __cfixed = static_cast<intern_type*>
341  (__builtin_alloca(sizeof(intern_type) * (__size + 1)));
342  __cfixed[0] = static_cast<intern_type>(__int_bom);
343  char_traits<intern_type>::copy(__cfixed + 1, __from, __size);
344  __cfrom = reinterpret_cast<char*>(__cfixed);
345  __conv = __iconv_adaptor(iconv, __desc, &__cfrom,
346  &__fbytes, &__cto, &__tbytes);
347  }
348  else
349  {
350  intern_type* __cfixed = const_cast<intern_type*>(__from);
351  __cfrom = reinterpret_cast<char*>(__cfixed);
352  __conv = __iconv_adaptor(iconv, __desc, &__cfrom, &__fbytes,
353  &__cto, &__tbytes);
354  }
355 
356  if (__conv != size_t(-1))
357  {
358  __from_next = reinterpret_cast<const intern_type*>(__cfrom);
359  __to_next = reinterpret_cast<extern_type*>(__cto);
360  __ret = codecvt_base::ok;
361  }
362  else
363  {
364  if (__fbytes < __fmultiple * (__from_end - __from))
365  {
366  __from_next = reinterpret_cast<const intern_type*>(__cfrom);
367  __to_next = reinterpret_cast<extern_type*>(__cto);
368  __ret = codecvt_base::partial;
369  }
370  else
371  __ret = codecvt_base::error;
372  }
373  }
374  return __ret;
375  }
376 
377  template<typename _InternT, typename _ExternT>
378  codecvt_base::result
380  do_unshift(state_type& __state, extern_type* __to,
381  extern_type* __to_end, extern_type*& __to_next) const
382  {
383  result __ret = codecvt_base::error;
384  if (__state.good())
385  {
386  const descriptor_type& __desc = __state.in_descriptor();
387  const size_t __tmultiple = sizeof(intern_type);
388  size_t __tlen = __tmultiple * (__to_end - __to);
389 
390  // Argument list for iconv specifies a byte sequence. Thus,
391  // all to/from arrays must be brutally casted to char*.
392  char* __cto = reinterpret_cast<char*>(__to);
393  size_t __conv = __iconv_adaptor(iconv,__desc, 0, 0,
394  &__cto, &__tlen);
395 
396  if (__conv != size_t(-1))
397  {
398  __to_next = reinterpret_cast<extern_type*>(__cto);
399  if (__tlen == __tmultiple * (__to_end - __to))
400  __ret = codecvt_base::noconv;
401  else if (__tlen == 0)
402  __ret = codecvt_base::ok;
403  else
404  __ret = codecvt_base::partial;
405  }
406  else
407  __ret = codecvt_base::error;
408  }
409  return __ret;
410  }
411 
412  template<typename _InternT, typename _ExternT>
413  codecvt_base::result
414  codecvt<_InternT, _ExternT, encoding_state>::
415  do_in(state_type& __state, const extern_type* __from,
416  const extern_type* __from_end, const extern_type*& __from_next,
417  intern_type* __to, intern_type* __to_end,
418  intern_type*& __to_next) const
419  {
420  result __ret = codecvt_base::error;
421  if (__state.good())
422  {
423  const descriptor_type& __desc = __state.in_descriptor();
424  const size_t __fmultiple = sizeof(extern_type);
425  size_t __flen = __fmultiple * (__from_end - __from);
426  const size_t __tmultiple = sizeof(intern_type);
427  size_t __tlen = __tmultiple * (__to_end - __to);
428 
429  // Argument list for iconv specifies a byte sequence. Thus,
430  // all to/from arrays must be brutally casted to char*.
431  char* __cto = reinterpret_cast<char*>(__to);
432  char* __cfrom;
433  size_t __conv;
434 
435  // Some encodings need a byte order marker as the first item
436  // in the byte stream, to designate endian-ness. The default
437  // value for the byte order marker is NULL, so if this is
438  // the case, it's not necessary and we can just go on our
439  // merry way.
440  int __ext_bom = __state.external_bom();
441  if (__ext_bom)
442  {
443  size_t __size = __from_end - __from;
444  extern_type* __cfixed = static_cast<extern_type*>
445  (__builtin_alloca(sizeof(extern_type) * (__size + 1)));
446  __cfixed[0] = static_cast<extern_type>(__ext_bom);
447  char_traits<extern_type>::copy(__cfixed + 1, __from, __size);
448  __cfrom = reinterpret_cast<char*>(__cfixed);
449  __conv = __iconv_adaptor(iconv, __desc, &__cfrom,
450  &__flen, &__cto, &__tlen);
451  }
452  else
453  {
454  extern_type* __cfixed = const_cast<extern_type*>(__from);
455  __cfrom = reinterpret_cast<char*>(__cfixed);
456  __conv = __iconv_adaptor(iconv, __desc, &__cfrom,
457  &__flen, &__cto, &__tlen);
458  }
459 
460 
461  if (__conv != size_t(-1))
462  {
463  __from_next = reinterpret_cast<const extern_type*>(__cfrom);
464  __to_next = reinterpret_cast<intern_type*>(__cto);
465  __ret = codecvt_base::ok;
466  }
467  else
468  {
469  if (__flen < static_cast<size_t>(__from_end - __from))
470  {
471  __from_next = reinterpret_cast<const extern_type*>(__cfrom);
472  __to_next = reinterpret_cast<intern_type*>(__cto);
473  __ret = codecvt_base::partial;
474  }
475  else
476  __ret = codecvt_base::error;
477  }
478  }
479  return __ret;
480  }
481 
482  template<typename _InternT, typename _ExternT>
483  int
484  codecvt<_InternT, _ExternT, encoding_state>::
485  do_encoding() const throw()
486  {
487  int __ret = 0;
488  if (sizeof(_ExternT) <= sizeof(_InternT))
489  __ret = sizeof(_InternT) / sizeof(_ExternT);
490  return __ret;
491  }
492 
493  template<typename _InternT, typename _ExternT>
494  bool
495  codecvt<_InternT, _ExternT, encoding_state>::
496  do_always_noconv() const throw()
497  { return false; }
498 
499  template<typename _InternT, typename _ExternT>
500  int
501  codecvt<_InternT, _ExternT, encoding_state>::
502  do_length(state_type&, const extern_type* __from,
503  const extern_type* __end, size_t __max) const
504  { return std::min(__max, static_cast<size_t>(__end - __from)); }
505 
506  // _GLIBCXX_RESOLVE_LIB_DEFECTS
507  // 74. Garbled text for codecvt::do_max_length
508  template<typename _InternT, typename _ExternT>
509  int
510  codecvt<_InternT, _ExternT, encoding_state>::
511  do_max_length() const throw()
512  { return 1; }
513 
514 _GLIBCXX_END_NAMESPACE_VERSION
515 } // namespace
516 
517 #endif
const _CharT * c_str() const noexcept
Return const pointer to null-terminated contents.
Definition: cow_string.h:2215
virtual result do_out(state_type &__state, const intern_type *__from, const intern_type *__from_end, const intern_type *&__from_next, extern_type *__to, extern_type *__to_end, extern_type *&__to_next) const
Convert from internal to external character set.
constexpr const _Tp & min(const _Tp &, const _Tp &)
This does what you think it does.
Definition: stl_algobase.h:233
Primary class template codecvt.NB: Generic, mostly useless implementation.
Definition: codecvt.h:277
size_type size() const noexcept
Returns the number of characters in the string, not including any null-termination.
Definition: cow_string.h:916
GNU extensions for public use.
Class representing stream positions.
Definition: postypes.h:82
Common base for codecvt functions.
Definition: codecvt.h:71
ISO C++ entities toplevel namespace is std.
Facet ID class.The ID class provides facets with an index used to identify them. Every facet class mu...
Extension to use iconv for dealing with character encodings.
Basis for explicit traits specializations.
Definition: char_traits.h:341