Main Page | Class Hierarchy | Alphabetical List | Data Structures | Directories | File List | Data Fields | Globals | Related Pages

normlzr.h

00001 /*
00002  ********************************************************************
00003  * COPYRIGHT: 
00004  * Copyright (c) 1996-2001, International Business Machines Corporation and
00005  * others. All Rights Reserved.
00006  ********************************************************************
00007  */
00008 
00009 #ifndef NORMLZR_H
00010 #define NORMLZR_H
00011 
00012 #include "unicode/utypes.h"
00013 #include "unicode/unistr.h"
00014 #include "unicode/chariter.h"
00015 #include "unicode/unorm.h"
00016 
/* Forward declaration only: within this header UCharIterator is used solely
 * through a pointer (the private member `text` of Normalizer), so the full
 * struct definition is not required here. The typedef lets the type be
 * written without the `struct` keyword. */
00017 struct UCharIterator;
00018 typedef struct UCharIterator UCharIterator;
00019 
00020 U_NAMESPACE_BEGIN
/**
 * Normalizer declares two kinds of API:
 *  - static helpers (normalize, compose, decompose, quickCheck, concatenate)
 *    that take whole UnicodeString arguments, and
 *  - an iteration API (current/first/last/next/previous, setIndex, ...) over
 *    text supplied as a UnicodeString, a UChar array, or a CharacterIterator.
 *
 * Modes are expressed as UNormalizationMode. The EMode-based overloads are
 * grouped under "Deprecated APIs"; they are converted to UNormalizationMode
 * by the inline functions defined after the class.
 *
 * NOTE(review): the per-member documentation of the original header is not
 * part of this listing; comments below state only what the declarations show.
 */
00111 class U_COMMON_API Normalizer
00112 {
00113 public:
        /**
         * Sentinel constant (0xffff).
         * NOTE(review): presumably the value the iteration functions return
         * when there is no further character -- confirm in the implementation.
         */
00119   enum {
00120       DONE=0xffff
00121   };
00122 
00123   // Constructors
00124 
        /** Construct from a UnicodeString and a normalization mode. */
00135   Normalizer(const UnicodeString& str, UNormalizationMode mode);
00136     
        /** Construct from a UChar pointer plus length and a normalization mode. */
00148   Normalizer(const UChar* str, int32_t length, UNormalizationMode mode);
00149 
        /** Construct from a CharacterIterator (passed by const reference) and a normalization mode. */
00160   Normalizer(const CharacterIterator& iter, UNormalizationMode mode);
00161 
        /** Copy constructor. */
00166   Normalizer(const Normalizer& copy);
00167 
        /**
         * Destructor.
         * NOTE(review): a copy constructor and a destructor are declared, but
         * no assignment operator. If this object owns what `text` points to,
         * the implicitly generated operator= would be unsafe -- confirm, and
         * consider declaring an assignment operator.
         */
00172   ~Normalizer();
00173 
00174 
00175   //-------------------------------------------------------------------------
00176   // Static utility methods
00177   //-------------------------------------------------------------------------
00178 
        /**
         * One-shot normalization entry point.
         * `source` is read-only; `result` and `status` are non-const
         * references, i.e. output / in-out parameters.
         */
00196   static void normalize(const UnicodeString& source,
00197                         UNormalizationMode mode, int32_t options,
00198                         UnicodeString& result,
00199                         UErrorCode &status);
00200 
        /**
         * Same parameter shape as normalize(), with a UBool `compat` flag
         * instead of a mode.
         * NOTE(review): `compat` presumably selects the compatibility form
         * (cf. COMPOSE vs. COMPOSE_COMPAT below) -- confirm.
         */
00222   static void compose(const UnicodeString& source,
00223                       UBool compat, int32_t options,
00224                       UnicodeString& result,
00225                       UErrorCode &status);
00226 
        /**
         * Counterpart of compose() for decomposition; same parameter shape.
         * NOTE(review): `compat` presumably selects the compatibility form
         * (cf. DECOMP vs. DECOMP_COMPAT below) -- confirm.
         */
00249   static void decompose(const UnicodeString& source,
00250                         UBool compat, int32_t options,
00251                         UnicodeString& result,
00252                         UErrorCode &status);
00253 
        /** Returns a UNormalizationCheckResult for `source` under `mode`; errors go to `status`. */
00272   static UNormalizationCheckResult
00273   quickCheck(const UnicodeString &source, UNormalizationMode mode, UErrorCode &status);
00274 
00275   /**
00276    * Concatenate normalized strings, making sure that the result is normalized as well.
00277    *
00278    * If both the left and the right strings are in
00279    * the normalization form according to "mode",
00280    * then the result will be
00281    *
00282    * \code
00283    *     result=normalize(left+right, mode)
00284    * \endcode
00285    *
00286    * For details see unorm_concatenate in unorm.h.
00287    *
00288    * @param left Left source string.
00289    * @param right Right source string.
00290    * @param result The output string.
00291    * @param mode The normalization mode.
00292    * @param options A bit set of normalization options.
00293    * @param errorCode ICU error code in/out parameter.
00294    *                  Must fulfill U_SUCCESS before the function call.
00295    * @return result
00296    *
00297    * @see unorm_concatenate
00298    * @see normalize
00299    * @see unorm_next
00300    * @see unorm_previous
00301    *
00302    * @draft ICU 2.1
00303    */
00304   static UnicodeString &
00305   concatenate(UnicodeString &left, UnicodeString &right,
00306               UnicodeString &result,
00307               UNormalizationMode mode, int32_t options,
00308               UErrorCode &errorCode);
00309 
00310   //-------------------------------------------------------------------------
00311   // Iteration API
00312   //-------------------------------------------------------------------------
00313   
        /*
         * The character-returning functions below yield a UChar32; the index
         * functions work with int32_t positions.
         * NOTE(review): the exact positioning semantics (and when DONE is
         * returned) are not visible in this listing -- confirm in the
         * implementation before relying on them.
         */
00322   UChar32              current(void);
00323 
00332   UChar32              first(void);
00333 
00342   UChar32              last(void);
00343 
00352   UChar32              next(void);
00353 
00362   UChar32              previous(void);
00363 
        /** Takes an index and returns a UChar32 (contrast with setIndexOnly, which returns nothing). */
00383   UChar32              setIndex(int32_t index);
00384 
        /** Takes an index and returns nothing. */
00394   void                 setIndexOnly(int32_t index);
00395 
00401   void                reset(void);
00402 
        /** Const accessors returning int32_t positions. */
00417   int32_t            getIndex(void) const;
00418 
00427   int32_t            startIndex(void) const;
00428 
00439   int32_t            endIndex(void) const;
00440 
        /** Equality comparison; operator!= (defined inline after the class) is its negation. */
00449   UBool        operator==(const Normalizer& that) const;
00450 
00459   inline UBool        operator!=(const Normalizer& that) const;
00460 
        /**
         * Returns a pointer to a Normalizer.
         * NOTE(review): presumably a newly allocated copy owned by the caller -- confirm.
         */
00467   Normalizer*        clone(void) const;
00468 
00475   int32_t                hashCode(void) const;
00476 
00477   //-------------------------------------------------------------------------
00478   // Property access methods
00479   //-------------------------------------------------------------------------
00480 
        /** Sets the mode from a UNormalizationMode (non-deprecated overload of setMode). */
00496   void setMode(UNormalizationMode newMode);
00497 
        /** Returns a UNormalizationMode; not to be confused with the private static getUMode(EMode). */
00508   UNormalizationMode getUMode(void) const;
00509 
        /**
         * Sets / queries a single option identified by an int32_t.
         * NOTE(review): presumably `option` is a bit such as IGNORE_HANGUL -- confirm.
         */
00526   void setOption(int32_t option, 
00527          UBool value);
00528 
00539   UBool getOption(int32_t option) const;
00540 
        /** setText overloads mirror the three input forms accepted by the constructors; errors go to `status`. */
00549   void setText(const UnicodeString& newText, 
00550            UErrorCode &status);
00551 
00560   void setText(const CharacterIterator& newText, 
00561            UErrorCode &status);
00562 
00572   void setText(const UChar* newText,
00573                     int32_t length,
00574             UErrorCode &status);
        /** Takes a non-const UnicodeString reference as its output parameter. */
00581   void            getText(UnicodeString&  result);
00582 
00583   //-------------------------------------------------------------------------
00584   // Deprecated APIs
00585   //-------------------------------------------------------------------------
00586 
        /** Bit flags from which the EMode values below are composed. */
00591   enum {
00592     COMPAT_BIT         = 1,
00593     DECOMP_BIT         = 2,
00594     COMPOSE_BIT        = 4,
00595     FCD_BIT            = 8
00596   };
00597 
        /**
         * Legacy mode enumeration. getUNormalizationMode() maps the values as:
         *   NO_OP -> UNORM_NONE, COMPOSE -> UNORM_NFC,
         *   COMPOSE_COMPAT -> UNORM_NFKC, DECOMP -> UNORM_NFD,
         *   DECOMP_COMPAT -> UNORM_NFKD, FCD -> UNORM_FCD.
         */
00602   enum EMode {
00616     NO_OP         = 0,
00617     
00633     COMPOSE         = COMPOSE_BIT,
00634 
00650     COMPOSE_COMPAT     = COMPOSE_BIT | COMPAT_BIT,
00651 
00667     DECOMP         = DECOMP_BIT,
00668 
00684     DECOMP_COMPAT     = DECOMP_BIT | COMPAT_BIT,
00685 
00689     FCD = FCD_BIT
00690   };
00691 
        /** Option bit 0x001. NOTE(review): its effect is not visible in this listing -- confirm. */
00693   enum {
00712     IGNORE_HANGUL     = 0x001
00713   };
00714 
        /*
         * EMode-based counterparts of the constructors above. The overloads
         * with an extra int32_t (`opt` / `option`) additionally take an
         * option value.
         */
00725   Normalizer(const UnicodeString& str, 
00726          EMode mode);
00727     
00746   Normalizer(const UnicodeString& str, 
00747          EMode mode, 
00748          int32_t opt);
00749 
00761   Normalizer(const UChar* str,
00762          int32_t length,
00763          EMode mode);
00764 
00780   Normalizer(const UChar* str,
00781          int32_t length,
00782          EMode mode,
00783          int32_t option);
00784 
00795   Normalizer(const CharacterIterator& iter, 
00796          EMode mode);
00797 
00813   Normalizer(const CharacterIterator& iter, 
00814          EMode mode, 
00815          int32_t opt);
00816 
        /** EMode overload; defined inline after the class: converts `mode` and forwards to the UNormalizationMode overload. */
00837   inline static void
00838   normalize(const UnicodeString& source, 
00839             EMode mode, 
00840             int32_t options,
00841             UnicodeString& result, 
00842             UErrorCode &status);
00843 
        /** EMode overload; defined inline after the class: converts `mode` and forwards to the UNormalizationMode overload. */
00860   inline static UNormalizationCheckResult
00861   quickCheck(const UnicodeString& source,
00862              EMode                mode, 
00863              UErrorCode&          status);
00864 
        /** EMode -> UNormalizationMode; an unknown mode sets U_ILLEGAL_ARGUMENT_ERROR and yields UNORM_DEFAULT. */
00872   inline static UNormalizationMode getUNormalizationMode(EMode mode, 
00873                                                   UErrorCode& status);
00874 
        /** UNormalizationMode -> EMode; an unknown mode sets U_ILLEGAL_ARGUMENT_ERROR and yields DECOMP_COMPAT. */
00882   inline static EMode getNormalizerEMode(UNormalizationMode mode, 
00883                                          UErrorCode& status);
00884 
        /** Stores the converted mode in fUMode; the conversion status is not reported to the caller. */
00911   inline void setMode(EMode newMode);
00912 
        /** Returns fUMode converted to EMode; the conversion status is not reported to the caller. */
00919   inline EMode getMode(void) const;
00920 
00921 private:
00922   //-------------------------------------------------------------------------
00923   // Private functions
00924   //-------------------------------------------------------------------------
00925 
00926   // Private utility methods for iteration
00927   // For documentation, see the source code
00928   UBool nextNormalize();
00929   UBool previousNormalize();
00930 
00931   void    init(CharacterIterator *iter);
00932   void    clearBuffer(void);
00933 
00934   // Helper, without UErrorCode, for easier transitional code
00935   // remove after 2002-sep-30 with EMode etc.
        // Same mapping as getUNormalizationMode(); unknown values give UNORM_DEFAULT.
00936   inline static UNormalizationMode getUMode(EMode mode);
00937 
00938   //-------------------------------------------------------------------------
00939   // Private data
00940   //-------------------------------------------------------------------------
00941 
        // fUMode is what the inline setMode(EMode)/getMode() write and read.
00942   UNormalizationMode  fUMode;
00943   int32_t             fOptions;
00944 
00945   // The input text and our position in it
00946   UCharIterator       *text;
00947 
00948   // The normalization buffer is the result of normalization
00949   // of the source in [currentIndex..nextIndex[ .
00950   int32_t         currentIndex, nextIndex;
00951 
00952   // A buffer for holding intermediate results
00953   UnicodeString       buffer;
00954   int32_t         bufferPos;
00955 };
00956 
00957 //-------------------------------------------------------------------------
00958 // Inline implementations
00959 //-------------------------------------------------------------------------
00960 
/** Inequality: the logical negation of operator==() applied to the same operand. */
00961 inline UBool Normalizer::operator!= (const Normalizer& other) const {
00962   return !(*this == other);
00963 }
00964 
/**
 * EMode overload of normalize(): converts the legacy mode with
 * getUNormalizationMode() and forwards to the UNormalizationMode overload.
 * If the conversion fails, status already carries the failure (and the mode
 * is UNORM_DEFAULT) when the call is forwarded.
 */
00965 inline void Normalizer::normalize(const UnicodeString& source,
00966                                   EMode mode, int32_t options,
00967                                   UnicodeString& result,
00968                                   UErrorCode &status) {
00969   const UNormalizationMode unormMode = getUNormalizationMode(mode, status);
00970   normalize(source, unormMode, options, result, status);
00971 }
00972 
/**
 * EMode overload of quickCheck(): converts the legacy mode with
 * getUNormalizationMode() and returns whatever the UNormalizationMode
 * overload returns. A failed conversion is visible to the callee via status.
 */
00973 inline UNormalizationCheckResult
00974 Normalizer::quickCheck(const UnicodeString& source, EMode mode, UErrorCode &status) {
00975   const UNormalizationMode unormMode = getUNormalizationMode(mode, status);
00976   return quickCheck(source, unormMode, status);
00977 }
00979 
/**
 * Legacy setter: stores the UNormalizationMode equivalent of newMode in fUMode.
 * No error is reported; an EMode value without an equivalent results in
 * UNORM_DEFAULT being stored.
 */
00980 inline void Normalizer::setMode(EMode newMode) {
00981   // getUMode(EMode) is the status-free form of the same EMode mapping.
00982   fUMode = getUMode(newMode);
00983 }
00985 
/**
 * Legacy getter: returns fUMode converted to EMode by getNormalizerEMode().
 * The conversion status is discarded; if fUMode has no EMode equivalent the
 * result is getNormalizerEMode()'s fallback value, DECOMP_COMPAT.
 */
00986 inline Normalizer::EMode Normalizer::getMode() const {
00987   UErrorCode ignoredStatus = U_ZERO_ERROR;
00988   const EMode legacyMode = getNormalizerEMode(fUMode, ignoredStatus);
00989   return legacyMode;
00990 }
00991 
/**
 * Converts a legacy EMode value to the corresponding UNormalizationMode.
 *
 * @param mode   Legacy mode to convert.
 * @param status In/out error code. If it does not satisfy U_SUCCESS on entry,
 *               mode is not examined and status is left unchanged. It is set
 *               to U_ILLEGAL_ARGUMENT_ERROR when mode is not one of the six
 *               values handled below.
 * @return The matching UNormalizationMode, or UNORM_DEFAULT on either kind of
 *         failure.
 */
00992 inline UNormalizationMode
00993 Normalizer::getUNormalizationMode(Normalizer::EMode mode, UErrorCode &status) {
00994   // A failure passed in by the caller short-circuits the conversion.
00995   if (!U_SUCCESS(status)) {
00996     return UNORM_DEFAULT;
00997   }
00998 
00999   switch (mode) {
01000   case Normalizer::NO_OP:          return UNORM_NONE;
01001   case Normalizer::COMPOSE:        return UNORM_NFC;
01002   case Normalizer::COMPOSE_COMPAT: return UNORM_NFKC;
01003   case Normalizer::DECOMP:         return UNORM_NFD;
01004   case Normalizer::DECOMP_COMPAT:  return UNORM_NFKD;
01005   case Normalizer::FCD:            return UNORM_FCD;
01006   default:                         break;
01007   }
01008 
01009   // Only reached for a value that matched none of the cases above.
01010   status = U_ILLEGAL_ARGUMENT_ERROR;
01011   return UNORM_DEFAULT;
01012 }
01017 
/**
 * Status-free variant of the EMode -> UNormalizationMode conversion.
 * Returns the same value as getUNormalizationMode() called with a fresh
 * U_ZERO_ERROR status: the mapped mode, or UNORM_DEFAULT for an EMode value
 * that has no mapping. The error code is not reported.
 */
01018 inline UNormalizationMode
01019 Normalizer::getUMode(Normalizer::EMode mode) {
01020   // Throw-away status: starts out successful, so the mapping is always consulted.
01021   UErrorCode discardedStatus = U_ZERO_ERROR;
01022   return getUNormalizationMode(mode, discardedStatus);
01023 }
01037 
/**
 * Converts a UNormalizationMode to the corresponding legacy EMode value.
 *
 * @param mode   Mode to convert.
 * @param status In/out error code. If it does not satisfy U_SUCCESS on entry,
 *               mode is not examined and status is left unchanged. It is set
 *               to U_ILLEGAL_ARGUMENT_ERROR when mode is not one of the six
 *               values handled below.
 * @return The matching EMode, or Normalizer::DECOMP_COMPAT on either kind of
 *         failure.
 */
01038 inline Normalizer::EMode
01039 Normalizer::getNormalizerEMode(UNormalizationMode mode, UErrorCode &status) {
01040   // A failure passed in by the caller short-circuits the conversion.
01041   if (!U_SUCCESS(status)) {
01042     return Normalizer::DECOMP_COMPAT;
01043   }
01044 
01045   switch (mode) {
01046   case UNORM_NONE: return Normalizer::NO_OP;
01047   case UNORM_NFD:  return Normalizer::DECOMP;
01048   case UNORM_NFKD: return Normalizer::DECOMP_COMPAT;
01049   case UNORM_NFC:  return Normalizer::COMPOSE;
01050   case UNORM_NFKC: return Normalizer::COMPOSE_COMPAT;
01051   case UNORM_FCD:  return Normalizer::FCD;
01052   default:         break;
01053   }
01054 
01055   // Only reached for a value that matched none of the cases above.
01056   status = U_ILLEGAL_ARGUMENT_ERROR;
01057   return Normalizer::DECOMP_COMPAT;
01058 }
01063 
01064 U_NAMESPACE_END
01065 #endif // NORMLZR_H

Generated on Sun May 22 18:53:55 2005 for ICU 2.1 by  doxygen 1.4.2