ZXStringUtils.m 5.8KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188
/*
 * Copyright 2012 ZXing authors
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
  16. #import "ZXByteArray.h"
  17. #import "ZXDecodeHints.h"
  18. #import "ZXStringUtils.h"
  @implementation ZXStringUtils

  /**
   * Guesses the string encoding of a raw byte sequence.
   *
   * If `hints` carries a non-zero encoding, that encoding is returned as-is and the
   * bytes are not inspected. Otherwise the bytes are scanned once while three
   * hypotheses (UTF-8, ISO-8859-1, Shift_JIS) are tracked in parallel; a hypothesis
   * is dropped as soon as a byte contradicts it, and the scan stops early when all
   * three have been dropped.
   *
   * @param bytes The bytes to inspect. A nil or empty array is scanned as zero bytes.
   * @param hints Optional decode hints; may be nil.
   * @return `hints.encoding` when it is non-zero; otherwise one of
   *         NSUTF8StringEncoding, NSShiftJISStringEncoding, NSISOLatin1StringEncoding,
   *         or the system encoding when none of the three hypotheses survived.
   */
  + (NSStringEncoding)guessEncoding:(ZXByteArray *)bytes hints:(ZXDecodeHints *)hints {
    // An explicit encoding supplied by the caller always wins; check it before doing
    // any other work.
    if (hints != nil) {
      NSStringEncoding encoding = hints.encoding;
      if (encoding > 0) {
        return encoding;
      }
    }

    NSStringEncoding systemEncoding = CFStringConvertEncodingToNSStringEncoding(CFStringGetSystemEncoding());
    BOOL assumeShiftJIS = systemEncoding == NSShiftJISStringEncoding || systemEncoding == NSJapaneseEUCStringEncoding;

    // For now, merely tries to distinguish ISO-8859-1, UTF-8 and Shift_JIS,
    // which should be by far the most common encodings.
    int length = bytes.length;
    BOOL canBeISO88591 = YES;
    BOOL canBeShiftJIS = YES;
    BOOL canBeUTF8 = YES;

    int utf8BytesLeft = 0;
    int utf2BytesChars = 0;
    int utf3BytesChars = 0;
    int utf4BytesChars = 0;

    int sjisBytesLeft = 0;
    int sjisKatakanaChars = 0;
    int sjisCurKatakanaWordLength = 0;
    int sjisCurDoubleBytesWordLength = 0;
    int sjisMaxKatakanaWordLength = 0;
    int sjisMaxDoubleBytesWordLength = 0;

    int isoHighOther = 0;

    // The UTF-8 byte order mark is exactly three bytes (EF BB BF). Bytes are masked to
    // 0..255 so the comparison does not depend on the signedness of the array element.
    BOOL utf8bom = length >= 3 &&
      (bytes.array[0] & 0xFF) == 0xEF &&
      (bytes.array[1] & 0xFF) == 0xBB &&
      (bytes.array[2] & 0xFF) == 0xBF;

    for (int i = 0;
         i < length && (canBeISO88591 || canBeShiftJIS || canBeUTF8);
         i++) {
      int value = bytes.array[i] & 0xFF;

      // UTF-8 stuff
      if (canBeUTF8) {
        if (utf8BytesLeft > 0) {
          // Inside a multi-byte character: every trailing byte must have the form
          // 10xxxxxx. Testing only the high bit would let another lead byte
          // (11xxxxxx) pass as a continuation byte and accept malformed input.
          if ((value & 0xC0) != 0x80) {
            canBeUTF8 = NO;
          } else {
            utf8BytesLeft--;
          }
        } else if ((value & 0x80) != 0) {
          // Lead byte: the number of leading 1 bits gives the character length.
          if ((value & 0xE0) == 0xC0) {
            // 110xxxxx: two-byte character
            utf8BytesLeft = 1;
            utf2BytesChars++;
          } else if ((value & 0xF0) == 0xE0) {
            // 1110xxxx: three-byte character
            utf8BytesLeft = 2;
            utf3BytesChars++;
          } else if ((value & 0xF8) == 0xF0) {
            // 11110xxx: four-byte character
            utf8BytesLeft = 3;
            utf4BytesChars++;
          } else {
            // 10xxxxxx (stray continuation byte) or 11111xxx (not a valid lead byte)
            canBeUTF8 = NO;
          }
        }
      }

      // ISO-8859-1 stuff
      if (canBeISO88591) {
        if (value > 0x7F && value < 0xA0) {
          // 0x80..0x9F is treated as ruling out ISO-8859-1.
          canBeISO88591 = NO;
        } else if (value > 0x9F) {
          // High bytes below 0xC0, plus 0xD7 and 0xF7, are counted as "other"
          // (not-alphanumeric) characters; many of them make Latin-1 less plausible.
          if (value < 0xC0 || value == 0xD7 || value == 0xF7) {
            isoHighOther++;
          }
        }
      }

      // Shift_JIS stuff
      if (canBeShiftJIS) {
        if (sjisBytesLeft > 0) {
          // Second byte of a double-byte character.
          if (value < 0x40 || value == 0x7F || value > 0xFC) {
            canBeShiftJIS = NO;
          } else {
            sjisBytesLeft--;
          }
        } else if (value == 0x80 || value == 0xA0 || value > 0xEF) {
          canBeShiftJIS = NO;
        } else if (value > 0xA0 && value < 0xE0) {
          // Single-byte (half-width) katakana.
          sjisKatakanaChars++;
          sjisCurDoubleBytesWordLength = 0;
          sjisCurKatakanaWordLength++;
          if (sjisCurKatakanaWordLength > sjisMaxKatakanaWordLength) {
            sjisMaxKatakanaWordLength = sjisCurKatakanaWordLength;
          }
        } else if (value > 0x7F) {
          // First byte of a double-byte character.
          sjisBytesLeft++;
          sjisCurKatakanaWordLength = 0;
          sjisCurDoubleBytesWordLength++;
          if (sjisCurDoubleBytesWordLength > sjisMaxDoubleBytesWordLength) {
            sjisMaxDoubleBytesWordLength = sjisCurDoubleBytesWordLength;
          }
        } else {
          // 7-bit byte: ends any current run of katakana / double-byte characters.
          sjisCurKatakanaWordLength = 0;
          sjisCurDoubleBytesWordLength = 0;
        }
      }
    }

    // Input that ends in the middle of a multi-byte character is not valid.
    if (canBeUTF8 && utf8BytesLeft > 0) {
      canBeUTF8 = NO;
    }
    if (canBeShiftJIS && sjisBytesLeft > 0) {
      canBeShiftJIS = NO;
    }

    // Easy -- if there is BOM or at least 1 valid not-single byte character (and no evidence it can't be UTF-8), done
    if (canBeUTF8 && (utf8bom || utf2BytesChars + utf3BytesChars + utf4BytesChars > 0)) {
      return NSUTF8StringEncoding;
    }

    // Easy -- if assuming Shift_JIS or at least 3 valid consecutive not-ascii characters (and no evidence it can't be), done
    if (canBeShiftJIS && (assumeShiftJIS || sjisMaxKatakanaWordLength >= 3 || sjisMaxDoubleBytesWordLength >= 3)) {
      return NSShiftJISStringEncoding;
    }

    // Distinguishing Shift_JIS and ISO-8859-1 can be a little tough for short words. The crude heuristic is:
    // - If we saw
    //   - only two consecutive katakana chars in the whole text, or
    //   - at least 10% of bytes that could be "upper" not-alphanumeric Latin1,
    // - then we conclude Shift_JIS, else ISO-8859-1
    // Note: for zero-length input the 10% test is 0 >= 0, so Shift_JIS is returned.
    if (canBeISO88591 && canBeShiftJIS) {
      BOOL looksLikeShiftJIS = (sjisMaxKatakanaWordLength == 2 && sjisKatakanaChars == 2) ||
        isoHighOther * 10 >= length;
      return looksLikeShiftJIS ? NSShiftJISStringEncoding : NSISOLatin1StringEncoding;
    }

    // Otherwise, try in order ISO-8859-1, Shift JIS, UTF-8 and fall back to default platform encoding
    if (canBeISO88591) {
      return NSISOLatin1StringEncoding;
    }
    if (canBeShiftJIS) {
      return NSShiftJISStringEncoding;
    }
    if (canBeUTF8) {
      return NSUTF8StringEncoding;
    }

    // Otherwise, we take a wild guess with platform encoding
    return systemEncoding;
  }

  @end