ByteExtenstion.cs 13 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322
  1. using System;
  2. using System.Globalization;
  3. using System.Linq;
  4. using System.Text;
  5. using System.Text.RegularExpressions;
  6. namespace Core.Util.Extension
  7. {
  /// <summary>
  /// Extension methods that guess the text encoding of a raw byte array.
  /// Detection order: byte order mark, round-trip probing with a few candidate
  /// encodings, a statistical UTF-16 / UTF-8 heuristic, and finally a pure-ASCII check.
  /// </summary>
  public static class ByteExtenstion
  {
      /// <summary>
      /// At least one suspicious UTF-8 sequence per this many bytes is required
      /// before the heuristic reports UTF-8. The value is arbitrary.
      /// </summary>
      private const double SuspiciousSequenceWindow = 500000.0;

      /// <summary>
      /// Minimum share of plain ASCII among the bytes that are not part of a
      /// suspicious sequence for the heuristic to report UTF-8.
      /// </summary>
      private const double MinAsciiShare = 0.8;

      /// <summary>Detects the text encoding of a byte array.</summary>
      /// <param name="data">Raw bytes to inspect.</param>
      /// <returns>The detected encoding, or <c>null</c> when nothing matched.</returns>
      /// <exception cref="ArgumentNullException"><paramref name="data"/> is null.</exception>
      public static Encoding Detect(this byte[] data)
      {
          if (data == null) throw new ArgumentNullException("data");

          // A byte order mark is authoritative; fall back to content probing otherwise.
          var encoding = DetectBOM(data);
          if (encoding != null) return encoding;

          return DetectInternal(data);
      }

      /// <summary>Detects the encoding from content only, ignoring any byte order mark.</summary>
      /// <remarks>
      /// NOTE(review): NUL and other ASCII-range bytes round-trip through UTF-8, so
      /// BOM-less UTF-16 text made of ASCII characters is accepted by the UTF-8 probe
      /// before the UTF-16 heuristic runs. Confirm whether this ordering is intended.
      /// </remarks>
      /// <param name="data">Raw bytes to inspect.</param>
      /// <returns>The detected encoding, or <c>null</c> when nothing matched.</returns>
      /// <exception cref="ArgumentNullException"><paramref name="data"/> is null.</exception>
      public static Encoding DetectInternal(this byte[] data)
      {
          if (data == null) throw new ArgumentNullException("data");

          // Brute force: try a few likely encodings and see which one round-trips.
          var candidates = new Encoding[]
          {
              // The most common one.
              Encoding.UTF8,
              // ANSI code page of the UI language. It may be unavailable on this
              // runtime, in which case the entry is null and gets filtered out below.
              TryGetEncoding(CultureInfo.CurrentUICulture.TextInfo.ANSICodePage),
              // Platform default.
              Encoding.Default
          };
          candidates = candidates
              .Where(s => s != null)
              .GroupBy(s => s.CodePage)
              .Select(s => s.First())
              .ToArray();

          // The first matching multi-byte encoding wins outright; a matching
          // single-byte encoding is only remembered as a fallback.
          Encoding fallback = null;
          foreach (var candidate in candidates)
          {
              if (!IsMatch(data, candidate)) continue;
              if (!candidate.IsSingleByte) return candidate;
              if (fallback == null) fallback = candidate;
          }
          if (fallback != null) return fallback;

          // Statistical UTF-16 / UTF-8 heuristic.
          var encoding = DetectUnicode(data);
          if (encoding != null) return encoding;

          // Last resort: plain ASCII.
          return DetectASCII(data);
      }

      /// <summary>Detects an encoding from a leading byte order mark.</summary>
      /// <param name="boms">Bytes whose first two to four entries are inspected.</param>
      /// <returns>The encoding named by the BOM, or <c>null</c> when no known BOM is present.</returns>
      /// <exception cref="ArgumentNullException"><paramref name="boms"/> is null.</exception>
      // ReSharper disable once InconsistentNaming
      public static Encoding DetectBOM(this byte[] boms)
      {
          if (boms == null) throw new ArgumentNullException("boms");

          if (boms.Length < 2) return null;

          // FF FE is UTF-16 LE unless it is followed by 00 00, which makes it UTF-32 LE.
          var startsWithFFFE = boms[0] == 0xff && boms[1] == 0xfe;
          var followedByTwoZeros = boms.Length >= 4 && boms[2] == 0 && boms[3] == 0;
          if (startsWithFFFE && !followedByTwoZeros) return Encoding.Unicode;
          if (boms[0] == 0xfe && boms[1] == 0xff) return Encoding.BigEndianUnicode;

          if (boms.Length < 3) return null;
          if (boms[0] == 0xef && boms[1] == 0xbb && boms[2] == 0xbf) return Encoding.UTF8;
          if (boms[0] == 0x2b && boms[1] == 0x2f && boms[2] == 0x76) return Encoding.UTF7;

          if (boms.Length < 4) return null;
          if (startsWithFFFE && followedByTwoZeros) return Encoding.UTF32;
          // 00 00 FE FF: UTF-32 big endian (code page 12001).
          if (boms[0] == 0 && boms[1] == 0 && boms[2] == 0xfe && boms[3] == 0xff) return Encoding.GetEncoding(12001);

          return null;
      }

      /// <summary>Returns the encoding for a code page, or <c>null</c> if this runtime does not provide it.</summary>
      /// <param name="codePage">Code page identifier.</param>
      private static Encoding TryGetEncoding(int codePage)
      {
          try
          {
              return Encoding.GetEncoding(codePage);
          }
          catch (ArgumentException)
          {
              // Unknown or out-of-range code page.
              return null;
          }
          catch (NotSupportedException)
          {
              // Code page not supported on this platform.
              return null;
          }
      }

      /// <summary>Returns ASCII when every byte is below 128, otherwise <c>null</c>.</summary>
      // ReSharper disable once InconsistentNaming
      static Encoding DetectASCII(byte[] data)
      {
          return data.Any(t => t >= 128) ? null : Encoding.ASCII;
      }

      /// <summary>
      /// Decodes and re-encodes <paramref name="data"/> with <paramref name="encoding"/> and
      /// reports whether the result agrees with the input at about 90% of the re-encoded positions.
      /// </summary>
      /// <param name="data">Raw bytes to probe.</param>
      /// <param name="encoding">Candidate encoding; <c>null</c> means <see cref="Encoding.Default"/>.</param>
      /// <returns><c>true</c> when the round trip reaches the threshold; otherwise <c>false</c>.</returns>
      static bool IsMatch(byte[] data, Encoding encoding)
      {
          if (encoding == null) encoding = Encoding.Default;

          byte[] buf;
          try
          {
              buf = encoding.GetBytes(encoding.GetString(data));
          }
          catch (ArgumentException)
          {
              // Covers null input and decoder/encoder fallback exceptions.
              return false;
          }
          catch (NotSupportedException)
          {
              return false;
          }

          // Allow for some noise: 90% agreement is enough. Computed as long so a
          // very large buffer cannot overflow the multiplication.
          var score = (long)buf.Length * 9 / 10;

          // Replacement characters can make the re-encoded buffer longer than the
          // input; only positions present in both arrays can be compared.
          var comparable = Math.Min(buf.Length, data.Length);
          long match = 0;
          for (var i = 0; i < comparable; i++)
          {
              if (data[i] != buf[i]) continue;
              match++;
              if (match >= score) return true;
          }
          return false;
      }

      /// <summary>Heuristically detects BOM-less UTF-16 (by NUL distribution) or UTF-8.</summary>
      /// <param name="data">Raw bytes to inspect.</param>
      /// <returns>UTF-16 LE, UTF-16 BE, UTF-8, or <c>null</c> when undecided.</returns>
      static Encoding DetectUnicode(byte[] data)
      {
          // Nothing to measure; this also avoids dividing by zero below.
          if (data.Length == 0) return null;

          var oddBinaryNullsInSample = 0;
          var evenBinaryNullsInSample = 0;
          var suspiciousUtf8SequenceCount = 0;
          var suspiciousUtf8BytesTotal = 0;
          // ReSharper disable once InconsistentNaming
          var likelyUSASCIIBytesInSample = 0;

          // Single pass counting NUL positions, UTF-8 encodings of upper-range
          // Windows-1252 characters, and probable US-ASCII bytes.
          // ReSharper disable once InconsistentNaming
          var skipUTF8Bytes = 0;
          for (long pos = 0; pos < data.Length; pos++)
          {
              var current = data[pos];

              // NUL distribution.
              if (current == 0)
              {
                  if (pos % 2 == 0)
                      evenBinaryNullsInSample++;
                  else
                      oddBinaryNullsInSample++;
              }

              // Printable ASCII.
              if (IsCommonASCII(current))
                  likelyUSASCIIBytesInSample++;

              // Suspicious UTF-8-like sequence; its trailing bytes are skipped so
              // they are not examined as the start of another sequence.
              if (skipUTF8Bytes > 0)
              {
                  skipUTF8Bytes--;
                  continue;
              }

              var len = DetectSuspiciousUTF8SequenceLength(data, pos);
              if (len > 0)
              {
                  suspiciousUtf8SequenceCount++;
                  suspiciousUtf8BytesTotal += len;
                  skipUTF8Bytes = len - 1;
              }
          }

          // UTF-16: in English/European text, little endian puts NULs mostly at odd
          // offsets (counting from 0) and big endian mostly at even offsets.
          var evenNullShare = (evenBinaryNullsInSample * 2.0) / data.Length;
          var oddNullShare = (oddBinaryNullsInSample * 2.0) / data.Length;
          if (evenNullShare < 0.2 && oddNullShare > 0.6) return Encoding.Unicode;
          if (oddNullShare < 0.2 && evenNullShare > 0.6) return Encoding.BigEndianUnicode;

          // UTF-8: the bytes must first be well-formed UTF-8 text. This follows
          // http://www.w3.org/International/questions/qa-forms-utf-8 and is checked
          // on the raw bytes, because decoding them as ASCII first turns every byte
          // of 0x80 or above into '?' and makes the check accept anything.
          if (!IsWellFormedUtf8Text(data)) return null;

          // Being valid UTF-8 says little about probability, so play statistics.
          // A random byte pair is one of the "suspicious" sequences with probability
          // 128 / (256 * 256) = 0.2%. Western text has far fewer of them, so more
          // than one per 500,000 bytes is taken as a sign of UTF-8. That only holds
          // if the data is western text, so the remaining bytes must be at least
          // 80% plain US-ASCII (random binary data would yield about 40%).
          if (suspiciousUtf8SequenceCount * SuspiciousSequenceWindow / data.Length < 1)
              return null;

          var remaining = data.Length - suspiciousUtf8BytesTotal;
          // If every byte belongs to a suspicious sequence there is no ASCII share to weigh.
          if (remaining == 0 || likelyUSASCIIBytesInSample * 1.0 / remaining >= MinAsciiShare)
              return Encoding.UTF8;

          return null;
      }

      /// <summary>
      /// Checks that the bytes consist only of TAB/LF/CR, printable ASCII and
      /// well-formed UTF-8 multi-byte sequences (no overlong forms, no surrogates,
      /// nothing above U+10FFFF).
      /// </summary>
      /// <param name="data">Raw bytes to validate.</param>
      private static bool IsWellFormedUtf8Text(byte[] data)
      {
          var i = 0;
          while (i < data.Length)
          {
              var lead = data[i];
              if (IsCommonASCII(lead))
              {
                  i++;
                  continue;
              }

              // The lead byte fixes the number of continuation bytes and, for a few
              // lead bytes, a narrower range for the first continuation byte.
              int trailing;
              byte secondMin = 0x80;
              byte secondMax = 0xBF;
              if (lead >= 0xC2 && lead <= 0xDF)
              {
                  trailing = 1;
              }
              else if (lead >= 0xE0 && lead <= 0xEF)
              {
                  trailing = 2;
                  if (lead == 0xE0) secondMin = 0xA0;        // excludes overlong forms
                  else if (lead == 0xED) secondMax = 0x9F;   // excludes surrogates
              }
              else if (lead >= 0xF0 && lead <= 0xF4)
              {
                  trailing = 3;
                  if (lead == 0xF0) secondMin = 0x90;        // excludes overlong forms
                  else if (lead == 0xF4) secondMax = 0x8F;   // stays within U+10FFFF
              }
              else
              {
                  // Control character, stray continuation byte or invalid lead byte.
                  return false;
              }

              // Truncated sequence at the end of the data.
              if (data.Length - i <= trailing) return false;

              var second = data[i + 1];
              if (second < secondMin || second > secondMax) return false;
              for (var k = 2; k <= trailing; k++)
              {
                  var continuation = data[i + k];
                  if (continuation < 0x80 || continuation > 0xBF) return false;
              }

              i += trailing + 1;
          }
          return true;
      }

      /// <summary>Tells whether a byte is TAB, LF, CR or printable ASCII (0x20 to 0x7E).</summary>
      /// <param name="bt">Byte to classify.</param>
      /// <returns><c>true</c> for common text bytes; otherwise <c>false</c>.</returns>
      // ReSharper disable once InconsistentNaming
      static bool IsCommonASCII(byte bt)
      {
          return bt == 0x09                     // horizontal tab
              || bt == 0x0A                     // line feed
              || bt == 0x0D                     // carriage return
              || (bt >= 0x20 && bt <= 0x7E);    // punctuation, digits and letters
      }

      /// <summary>
      /// Returns the length of the "suspicious" UTF-8 sequence that starts at
      /// <paramref name="pos"/>, or 0 when there is none. The two- and three-byte
      /// patterns recognised are listed in the body.
      /// </summary>
      /// <param name="buf">Bytes being scanned.</param>
      /// <param name="pos">Index of the candidate lead byte.</param>
      /// <returns>2 or 3 for a recognised sequence, otherwise 0.</returns>
      // ReSharper disable once InconsistentNaming
      private static int DetectSuspiciousUTF8SequenceLength(byte[] buf, Int64 pos)
      {
          // At least one byte must follow the lead byte.
          if (buf.Length <= pos + 1) return 0;

          var first = buf[pos];
          var second = buf[pos + 1];

          if (first == 0xC2)
          {
              if (second == 0x81 || second == 0x8D || second == 0x8F || second == 0x90 || second == 0x9D
                  || (second >= 0xA0 && second <= 0xBF))
                  return 2;
          }
          else if (first == 0xC3)
          {
              if (second >= 0x80 && second <= 0xBF) return 2;
          }
          else if (first == 0xC5)
          {
              if (second == 0x92 || second == 0x93 || second == 0xA0 || second == 0xA1
                  || second == 0xB8 || second == 0xBD || second == 0xBE)
                  return 2;
          }
          else if (first == 0xC6)
          {
              if (second == 0x92) return 2;
          }
          else if (first == 0xCB)
          {
              if (second == 0x86 || second == 0x9C) return 2;
          }
          else if (first == 0xE2 && buf.Length > pos + 2)
          {
              // The third byte is read only when it exists; the length test must be
              // strict, otherwise an 0xE2 in second-to-last position reads past the end.
              var three = buf[pos + 2];
              if (second == 0x80)
              {
                  if (three == 0x93 || three == 0x94 || three == 0x98 || three == 0x99 || three == 0x9A
                      || three == 0x9C || three == 0x9D || three == 0x9E
                      || three == 0xA0 || three == 0xA1 || three == 0xA2
                      || three == 0xA6 || three == 0xB0 || three == 0xB9 || three == 0xBA)
                      return 3;
              }
              else if ((second == 0x82 && three == 0xAC) || (second == 0x84 && three == 0xA2))
              {
                  return 3;
              }
          }

          return 0;
      }
  }
  282. }