From 03a4f5c4eabdff4821989ca43a56eb3f63fc1fde Mon Sep 17 00:00:00 2001 From: Dick Hollenbeck Date: Fri, 6 Dec 2013 14:22:10 -0600 Subject: [PATCH] Move "code costly" functions in experimental class UTF8 to be not "inlined", preferring compactness. --- tools/UTF8.cpp | 130 ++++++++++++++++++++++++++++++++----------------- 1 file changed, 85 insertions(+), 45 deletions(-) diff --git a/tools/UTF8.cpp b/tools/UTF8.cpp index 469a1c13ab..0fd5fb65d5 100644 --- a/tools/UTF8.cpp +++ b/tools/UTF8.cpp @@ -2,68 +2,69 @@ #include #include #include -#include +#include /** * Class UTF8 - * is an 8 bit std::string assuredly encoded in UTF8 that supplies special - * conversion support to and from wxString, and has iteration over - * UTF8 code points. + * is an 8 bit std::string that is assuredly encoded in UTF8, and supplies special + * conversion support to and from wxString, and has iteration over unicode characters. + * + * @author Dick Hollenbeck */ class UTF8 : public std::string { - public: - UTF8( const wxString& o ) : - std::string( (const char*) o.utf8_str() ) - { - // @todo: should not be inline. - } + UTF8( const wxString& o ); + /// This is the only constructor for which you could end up with + /// non-UTF8 encoding, but that would be your fault. UTF8( const char* txt ) : std::string( txt ) { - // ok inline } explicit UTF8( const std::string& o ) : std::string( o ) { - // ok inline } UTF8() : std::string() { - // ok inline } - UTF8& operator = ( const wxString& o ) + UTF8& operator=( const wxString& o ); + + UTF8& operator=( const std::string& o ) { - // @todo: should not be inline. - std::string::operator=( (const char*) o.utf8_str() ); + std::string::operator=( o ); return *this; } - UTF8& operator = ( const std::string& o ) + operator wxString () const; + + /// This one is not in std::string, and one wonders why... might be a solid + /// enough reason to remove it still. 
+ operator char* () const { - std::string::operator = ( o ); - return *this; + return (char*) c_str(); } - operator wxString () const - { - // @todo: should not be inline. - return wxString( c_str(), wxConvUTF8 ); - } - - static int uni_forward( unsigned char* it, uint32_t* result ) + /** + * Function uni_forward + * advances over a UTF8 encoded multibyte character, capturing the unicode + * character as it goes, and returning the number of bytes consumed. + * + * @param aSequence is the UTF8 byte sequence. + * @param aResult is where to put the unicode character. + */ + static int uni_forward( unsigned char* aSequence, unsigned* aResult ) { // @todo: have this read UTF8 characters into result, not bytes. // What's here now is scaffolding, reading single byte characters only. - *result = *it; + *aResult = *aSequence; return 1; } @@ -71,37 +72,40 @@ public: * class uni_iter * is a non-mutable iterator that walks through code points in the UTF8 encoded * string. The normal ++(), ++(int), ->(), and *() operators are all supported and - * they return a uint32_t holding the unicode character appropriate for respective + * they return an unsigned holding the unicode character appropriate for respective * operation. */ class uni_iter { + friend class UTF8; + unsigned char* it; - public: uni_iter( const char* start ) : it( (unsigned char*) start ) { + assert( sizeof(unsigned) >= 4 ); } + public: + /// pre-increment and return unicode at new position - uint32_t operator++() + unsigned operator++() { - uint32_t result; + unsigned result; // advance, and toss the result it += uni_forward( it, &result ); // get the next result, but do not advance: uni_forward( it, &result ); - return result; } /// post-increment and return unicode at initial position - uint32_t operator++( int ) + unsigned operator++( int ) { - uint32_t result; + unsigned result; // grab the result and advance. 
it += uni_forward( it, &result ); @@ -109,9 +113,9 @@ public: } /// return unicode at current position - uint32_t operator->() const + unsigned operator->() const { - uint32_t result; + unsigned result; // grab the result, do not advance uni_forward( it, &result ); @@ -119,9 +123,9 @@ public: } /// return unicode at current position - uint32_t operator*() const + unsigned operator*() const { - uint32_t result; + unsigned result; // grab the result, do not advance uni_forward( it, &result ); @@ -136,11 +140,19 @@ public: bool operator>=( const uni_iter& other ) const { return it >= other.it; } }; + /** + * Function ubegin + * returns a @a uni_iter initialized to the start of this UTF8 byte sequence. + */ uni_iter ubegin() const { return uni_iter( data() ); } + /** + * Function uend + * returns a @a uni_iter initialized to the end of this UTF8 byte sequence. + */ uni_iter uend() const { return uni_iter( data() + size() ); @@ -148,9 +160,11 @@ public: }; -wxString aFunctionTaking_wxString( const wxString& wx ) +wxString wxFunctionTaking_wxString( const wxString& wx ) { - printf( "%s: '%s'\n", __func__, UTF8( wx ).c_str() ); + printf( "%s:'%s'\n", __func__, (char*) UTF8( wx ) ); + printf( "%s:'%s'\n", __func__, (const char*) UTF8( wx ) ); + printf( "%s:'%s'\n", __func__, UTF8( wx ).c_str() ); return wx; } @@ -158,9 +172,11 @@ wxString aFunctionTaking_wxString( const wxString& wx ) int main() { - UTF8 u1 = "output"; std::string str = "input"; - wxString wx = wxT( "input" ); + UTF8 u1 = "initial"; + wxString wx = wxT( "input2" ); + + printf( "u1:'%s'\n", u1.c_str() ); u1 = str; @@ -170,25 +186,49 @@ int main() u2 += 'X'; - printf( "utf2:'%s'\n", u2.c_str() ); + printf( "u2:'%s'\n", u2.c_str() ); // key accomplishments here: // 1) passing a UTF8 to a function which normally takes a wxString. // 2) return a wxString back into a UTF8. 
- UTF8 result = aFunctionTaking_wxString( u2 ); + UTF8 result = wxFunctionTaking_wxString( u2 ); printf( "result:'%s'\n", result.c_str() ); // test the unicode iterator: for( UTF8::uni_iter it = u2.ubegin(); it != u2.uend(); ) { + // test post-increment: printf( " _%c_", it++ ); - // after UTF7::uni_forward() is implemented, it++ %c is no longer useable. + // after UTF8::uni_forward() is implemented, %c is no longer useable. // printf( " _%02x_", it++ ); } + printf( "\n" ); return 0; } + +// These to go into a library *.cpp, they are not inlined so that code space +// is saved creating the intermediate objects and referencing wxConvUTF8. + + +UTF8::UTF8( const wxString& o ) : + std::string( (const char*) o.utf8_str() ) +{ +} + + +UTF8::operator wxString () const +{ + return wxString( c_str(), wxConvUTF8 ); +} + + +UTF8& UTF8::operator=( const wxString& o ) +{ + std::string::operator=( (const char*) o.utf8_str() ); + return *this; +}