00001
00002
00003
00004
00005
00006
00007
00008
00009 #ifndef NORMLZR_H
00010 #define NORMLZR_H
00011
00012 #include "unicode/utypes.h"
00013 #include "unicode/unistr.h"
00014 #include "unicode/chariter.h"
00015 #include "unicode/unorm.h"
00016
/*
 * Opaque forward declaration: this header only needs pointers to
 * UCharIterator (see Normalizer::text), so the full definition is not
 * pulled in here. The elaborated type specifier inside the typedef both
 * declares the struct and introduces the plain type name.
 */
typedef struct UCharIterator UCharIterator;
00019
00020 U_NAMESPACE_BEGIN
00111 class U_COMMON_API Normalizer
00112 {
00113 public:
00119 enum {
00120 DONE=0xffff
00121 };
00122
00123
00124
00135 Normalizer(const UnicodeString& str, UNormalizationMode mode);
00136
00148 Normalizer(const UChar* str, int32_t length, UNormalizationMode mode);
00149
00160 Normalizer(const CharacterIterator& iter, UNormalizationMode mode);
00161
00166 Normalizer(const Normalizer& copy);
00167
00172 ~Normalizer();
00173
00174
00175
00176
00177
00178
00196 static void normalize(const UnicodeString& source,
00197 UNormalizationMode mode, int32_t options,
00198 UnicodeString& result,
00199 UErrorCode &status);
00200
00222 static void compose(const UnicodeString& source,
00223 UBool compat, int32_t options,
00224 UnicodeString& result,
00225 UErrorCode &status);
00226
00249 static void decompose(const UnicodeString& source,
00250 UBool compat, int32_t options,
00251 UnicodeString& result,
00252 UErrorCode &status);
00253
00272 static UNormalizationCheckResult
00273 quickCheck(const UnicodeString &source, UNormalizationMode mode, UErrorCode &status);
00274
00275
00276
00277
00278
00279
00280
00281
00282
00283
00284
00285
00286
00287
00288
00289
00290
00291
00292
00293
00294
00295
00296
00297
00298
00299
00300
00301
00302
00303
00304 static UnicodeString &
00305 concatenate(UnicodeString &left, UnicodeString &right,
00306 UnicodeString &result,
00307 UNormalizationMode mode, int32_t options,
00308 UErrorCode &errorCode);
00309
00310
00311
00312
00313
00322 UChar32 current(void);
00323
00332 UChar32 first(void);
00333
00342 UChar32 last(void);
00343
00352 UChar32 next(void);
00353
00362 UChar32 previous(void);
00363
00383 UChar32 setIndex(int32_t index);
00384
00394 void setIndexOnly(int32_t index);
00395
00401 void reset(void);
00402
00417 int32_t getIndex(void) const;
00418
00427 int32_t startIndex(void) const;
00428
00439 int32_t endIndex(void) const;
00440
00449 UBool operator==(const Normalizer& that) const;
00450
00459 inline UBool operator!=(const Normalizer& that) const;
00460
00467 Normalizer* clone(void) const;
00468
00475 int32_t hashCode(void) const;
00476
00477
00478
00479
00480
00496 void setMode(UNormalizationMode newMode);
00497
00508 UNormalizationMode getUMode(void) const;
00509
00526 void setOption(int32_t option,
00527 UBool value);
00528
00539 UBool getOption(int32_t option) const;
00540
00549 void setText(const UnicodeString& newText,
00550 UErrorCode &status);
00551
00560 void setText(const CharacterIterator& newText,
00561 UErrorCode &status);
00562
00572 void setText(const UChar* newText,
00573 int32_t length,
00574 UErrorCode &status);
00581 void getText(UnicodeString& result);
00582
00583
00584
00585
00586
00591 enum {
00592 COMPAT_BIT = 1,
00593 DECOMP_BIT = 2,
00594 COMPOSE_BIT = 4,
00595 FCD_BIT = 8
00596 };
00597
00602 enum EMode {
00616 NO_OP = 0,
00617
00633 COMPOSE = COMPOSE_BIT,
00634
00650 COMPOSE_COMPAT = COMPOSE_BIT | COMPAT_BIT,
00651
00667 DECOMP = DECOMP_BIT,
00668
00684 DECOMP_COMPAT = DECOMP_BIT | COMPAT_BIT,
00685
00689 FCD = FCD_BIT
00690 };
00691
00693 enum {
00712 IGNORE_HANGUL = 0x001
00713 };
00714
00725 Normalizer(const UnicodeString& str,
00726 EMode mode);
00727
00746 Normalizer(const UnicodeString& str,
00747 EMode mode,
00748 int32_t opt);
00749
00761 Normalizer(const UChar* str,
00762 int32_t length,
00763 EMode mode);
00764
00780 Normalizer(const UChar* str,
00781 int32_t length,
00782 EMode mode,
00783 int32_t option);
00784
00795 Normalizer(const CharacterIterator& iter,
00796 EMode mode);
00797
00813 Normalizer(const CharacterIterator& iter,
00814 EMode mode,
00815 int32_t opt);
00816
00837 inline static void
00838 normalize(const UnicodeString& source,
00839 EMode mode,
00840 int32_t options,
00841 UnicodeString& result,
00842 UErrorCode &status);
00843
00860 inline static UNormalizationCheckResult
00861 quickCheck(const UnicodeString& source,
00862 EMode mode,
00863 UErrorCode& status);
00864
00872 inline static UNormalizationMode getUNormalizationMode(EMode mode,
00873 UErrorCode& status);
00874
00882 inline static EMode getNormalizerEMode(UNormalizationMode mode,
00883 UErrorCode& status);
00884
00911 inline void setMode(EMode newMode);
00912
00919 inline EMode getMode(void) const;
00920
00921 private:
00922
00923
00924
00925
00926
00927
00928 UBool nextNormalize();
00929 UBool previousNormalize();
00930
00931 void init(CharacterIterator *iter);
00932 void clearBuffer(void);
00933
00934
00935
00936 inline static UNormalizationMode getUMode(EMode mode);
00937
00938
00939
00940
00941
00942 UNormalizationMode fUMode;
00943 int32_t fOptions;
00944
00945
00946 UCharIterator *text;
00947
00948
00949
00950 int32_t currentIndex, nextIndex;
00951
00952
00953 UnicodeString buffer;
00954 int32_t bufferPos;
00955 };
00956
00957
00958
00959
00960
00961 inline UBool
00962 Normalizer::operator!= (const Normalizer& other) const
00963 { return ! operator==(other); }
00964
00965 inline void
00966 Normalizer::normalize(const UnicodeString& source,
00967 EMode mode, int32_t options,
00968 UnicodeString& result,
00969 UErrorCode &status) {
00970 normalize(source, getUNormalizationMode(mode, status), options, result, status);
00971 }
00972
00973 inline UNormalizationCheckResult
00974 Normalizer::quickCheck(const UnicodeString& source,
00975 EMode mode,
00976 UErrorCode &status) {
00977 return quickCheck(source, getUNormalizationMode(mode, status), status);
00978 }
00979
00980 inline void
00981 Normalizer::setMode(EMode newMode) {
00982 UErrorCode status = U_ZERO_ERROR;
00983 fUMode = getUNormalizationMode(newMode, status);
00984 }
00985
00986 inline Normalizer::EMode
00987 Normalizer::getMode() const {
00988 UErrorCode status = U_ZERO_ERROR;
00989 return getNormalizerEMode(fUMode, status);
00990 }
00991
00992 inline UNormalizationMode Normalizer::getUNormalizationMode(
00993 Normalizer::EMode mode, UErrorCode &status)
00994 {
00995 if (U_SUCCESS(status))
00996 {
00997 switch (mode)
00998 {
00999 case Normalizer::NO_OP :
01000 return UNORM_NONE;
01001 case Normalizer::COMPOSE :
01002 return UNORM_NFC;
01003 case Normalizer::COMPOSE_COMPAT :
01004 return UNORM_NFKC;
01005 case Normalizer::DECOMP :
01006 return UNORM_NFD;
01007 case Normalizer::DECOMP_COMPAT :
01008 return UNORM_NFKD;
01009 case Normalizer::FCD:
01010 return UNORM_FCD;
01011 default :
01012 status = U_ILLEGAL_ARGUMENT_ERROR;
01013 }
01014 }
01015 return UNORM_DEFAULT;
01016 }
01017
01018 inline UNormalizationMode
01019 Normalizer::getUMode(Normalizer::EMode mode) {
01020 switch(mode) {
01021 case Normalizer::NO_OP :
01022 return UNORM_NONE;
01023 case Normalizer::COMPOSE :
01024 return UNORM_NFC;
01025 case Normalizer::COMPOSE_COMPAT :
01026 return UNORM_NFKC;
01027 case Normalizer::DECOMP :
01028 return UNORM_NFD;
01029 case Normalizer::DECOMP_COMPAT :
01030 return UNORM_NFKD;
01031 case Normalizer::FCD:
01032 return UNORM_FCD;
01033 default :
01034 return UNORM_DEFAULT;
01035 }
01036 }
01037
01038 inline Normalizer::EMode Normalizer::getNormalizerEMode(
01039 UNormalizationMode mode, UErrorCode &status)
01040 {
01041 if (U_SUCCESS(status))
01042 {
01043 switch (mode)
01044 {
01045 case UNORM_NONE :
01046 return Normalizer::NO_OP;
01047 case UNORM_NFD :
01048 return Normalizer::DECOMP;
01049 case UNORM_NFKD :
01050 return Normalizer::DECOMP_COMPAT;
01051 case UNORM_NFC :
01052 return Normalizer::COMPOSE;
01053 case UNORM_NFKC :
01054 return Normalizer::COMPOSE_COMPAT;
01055 case UNORM_FCD:
01056 return Normalizer::FCD;
01057 default :
01058 status = U_ILLEGAL_ARGUMENT_ERROR;
01059 }
01060 }
01061 return Normalizer::DECOMP_COMPAT;
01062 }
01063
01064 U_NAMESPACE_END
#endif // NORMLZR_H