using System;
using System.Globalization;
using System.Linq;
using System.Text;
using System.Text.RegularExpressions;

namespace Core.Util.Extension
{
    /// <summary>Extension methods that guess the text encoding of a raw byte array.</summary>
    public static class ByteExtenstion
    {
        /// <summary>
        /// Matches a string that is entirely well-formed UTF-8 when every char of the
        /// string carries one raw byte value (0x00-0xFF).
        /// Pattern taken from http://www.w3.org/International/questions/qa-forms-utf-8
        /// </summary>
        private static readonly Regex Utf8Pattern = new Regex(
            @"\A(?:"
            + @"[\x09\x0A\x0D\x20-\x7E]"            // ASCII
            + @"|[\xC2-\xDF][\x80-\xBF]"            // non-overlong 2-byte
            + @"|\xE0[\xA0-\xBF][\x80-\xBF]"        // excluding overlongs
            + @"|[\xE1-\xEC\xEE\xEF][\x80-\xBF]{2}" // straight 3-byte
            + @"|\xED[\x80-\x9F][\x80-\xBF]"        // excluding surrogates
            + @"|\xF0[\x90-\xBF][\x80-\xBF]{2}"     // planes 1-3
            + @"|[\xF1-\xF3][\x80-\xBF]{3}"         // planes 4-15
            + @"|\xF4[\x80-\x8F][\x80-\xBF]{2}"     // plane 16
            + @")*\z",
            RegexOptions.Compiled);

        /// <summary>Detects the encoding of a byte array, checking for a BOM first.</summary>
        /// <param name="data">Raw bytes to inspect.</param>
        /// <returns>The detected encoding, or <c>null</c> when none could be determined.</returns>
        /// <exception cref="ArgumentNullException"><paramref name="data"/> is <c>null</c>.</exception>
        public static Encoding Detect(this byte[] data)
        {
            if (data == null)
            {
                throw new ArgumentNullException("data");
            }

            // A byte-order mark is the most reliable signal, so it wins outright.
            var fromBom = DetectBOM(data);
            if (fromBom != null)
            {
                return fromBom;
            }

            return DetectInternal(data);
        }

        /// <summary>
        /// Detects the encoding of a byte array without looking at a BOM.
        /// Tries a decode/encode round trip with UTF-8, the UI culture's ANSI code page and
        /// the default encoding; then a heuristic UTF-16/UTF-8 scan; finally a plain ASCII check.
        /// </summary>
        /// <param name="data">Raw bytes to inspect.</param>
        /// <returns>The detected encoding, or <c>null</c> when none could be determined.</returns>
        /// <exception cref="ArgumentNullException"><paramref name="data"/> is <c>null</c>.</exception>
        public static Encoding DetectInternal(this byte[] data)
        {
            if (data == null)
            {
                throw new ArgumentNullException("data");
            }

            // Brute-force candidates, de-duplicated by code page (order is preserved).
            var candidates = new[]
                {
                    // Most common
                    Encoding.UTF8,
                    // ANSI code page of the UI language; null when the runtime cannot provide it
                    TryGetEncoding(CultureInfo.CurrentUICulture.TextInfo.ANSICodePage),
                    // Local default encoding
                    Encoding.Default
                }
                .Where(e => e != null)
                .GroupBy(e => e.CodePage)
                .Select(g => g.First())
                .ToArray();

            // The first matching multi-byte encoding wins immediately; a matching
            // single-byte encoding is only remembered as a fallback.
            Encoding singleByteMatch = null;
            foreach (var candidate in candidates)
            {
                if (!IsMatch(data, candidate))
                {
                    continue;
                }

                if (!candidate.IsSingleByte)
                {
                    return candidate;
                }

                if (singleByteMatch == null)
                {
                    singleByteMatch = candidate;
                }
            }

            if (singleByteMatch != null)
            {
                return singleByteMatch;
            }

            // Heuristic Unicode detection, then the simple ASCII check.
            return DetectUnicode(data) ?? DetectASCII(data);
        }

        /// <summary>Detects an encoding from a leading byte-order mark.</summary>
        /// <param name="boms">Bytes whose first 2-4 entries may be a BOM.</param>
        /// <returns>The encoding announced by the BOM, or <c>null</c> when no known BOM is present.</returns>
        /// <exception cref="ArgumentNullException"><paramref name="boms"/> is <c>null</c>.</exception>
        // ReSharper disable once InconsistentNaming
        public static Encoding DetectBOM(this byte[] boms)
        {
            if (boms == null)
            {
                throw new ArgumentNullException("boms");
            }

            if (boms.Length < 2)
            {
                return null;
            }

            // FF FE is UTF-16 LE unless it is followed by 00 00, which is the UTF-32 LE mark.
            if (boms[0] == 0xff && boms[1] == 0xfe && (boms.Length < 4 || boms[2] != 0 || boms[3] != 0))
            {
                return Encoding.Unicode;
            }

            if (boms[0] == 0xfe && boms[1] == 0xff)
            {
                return Encoding.BigEndianUnicode;
            }

            if (boms.Length < 3)
            {
                return null;
            }

            if (boms[0] == 0xef && boms[1] == 0xbb && boms[2] == 0xbf)
            {
                return Encoding.UTF8;
            }

            if (boms[0] == 0x2b && boms[1] == 0x2f && boms[2] == 0x76)
            {
                return Encoding.UTF7;
            }

            if (boms.Length < 4)
            {
                return null;
            }

            if (boms[0] == 0xff && boms[1] == 0xfe && boms[2] == 0 && boms[3] == 0)
            {
                return Encoding.UTF32;
            }

            // 00 00 FE FF: code page 12001 is UTF-32 big-endian.
            if (boms[0] == 0 && boms[1] == 0 && boms[2] == 0xfe && boms[3] == 0xff)
            {
                return Encoding.GetEncoding(12001);
            }

            return null;
        }

        /// <summary>Looks up an encoding by code page without throwing when it is unavailable.</summary>
        /// <param name="codePage">Code page identifier.</param>
        /// <returns>The encoding, or <c>null</c> when the runtime does not support the code page.</returns>
        private static Encoding TryGetEncoding(int codePage)
        {
            try
            {
                return Encoding.GetEncoding(codePage);
            }
            catch (ArgumentException)
            {
                // Invalid or unsupported code page.
                return null;
            }
            catch (NotSupportedException)
            {
                // Code page not provided by this platform/runtime.
                return null;
            }
        }

        /// <summary>Returns ASCII when every byte is below 128, otherwise <c>null</c>.</summary>
        /// <param name="data">Raw bytes to inspect.</param>
        // ReSharper disable once InconsistentNaming
        private static Encoding DetectASCII(byte[] data)
        {
            return data.Any(b => b >= 128) ? null : Encoding.ASCII;
        }

        /// <summary>
        /// Checks whether decoding and re-encoding <paramref name="data"/> with
        /// <paramref name="encoding"/> reproduces at least 90% of the re-encoded length
        /// at identical byte positions.
        /// </summary>
        /// <param name="data">Raw bytes to inspect.</param>
        /// <param name="encoding">Candidate encoding; <c>null</c> means <see cref="Encoding.Default"/>.</param>
        /// <returns><c>true</c> when enough positions match; <c>false</c> otherwise (including for empty input).</returns>
        private static bool IsMatch(byte[] data, Encoding encoding)
        {
            if (encoding == null)
            {
                encoding = Encoding.Default;
            }

            byte[] roundTripped;
            try
            {
                roundTripped = encoding.GetBytes(encoding.GetString(data));
            }
            catch (ArgumentException)
            {
                // Decoder/encoder fallback exceptions derive from ArgumentException:
                // the data cannot be represented in this encoding.
                return false;
            }

            // Allow for some noise: 90% matching positions is enough.
            // Computed in 64 bits so very large buffers cannot overflow.
            long required = (long)roundTripped.Length * 9 / 10;

            // The re-encoded buffer may be longer than the input (replacement characters),
            // so only positions present in both arrays can be compared.
            int comparable = Math.Min(data.Length, roundTripped.Length);
            int matched = 0;
            for (int i = 0; i < comparable; i++)
            {
                if (data[i] != roundTripped[i])
                {
                    continue;
                }

                matched++;
                if (matched >= required)
                {
                    return true;
                }
            }

            return false;
        }

        /// <summary>Heuristically detects UTF-16 (LE/BE) or UTF-8 without a BOM.</summary>
        /// <param name="data">Raw bytes to inspect.</param>
        /// <returns>The guessed encoding, or <c>null</c> when the statistics are inconclusive.</returns>
        private static Encoding DetectUnicode(byte[] data)
        {
            if (data.Length == 0)
            {
                // Nothing to measure; every ratio below would be undefined.
                return null;
            }

            int nullsAtOddOffsets = 0;
            int nullsAtEvenOffsets = 0;
            int suspiciousSequences = 0;
            int suspiciousBytes = 0;
            int commonAsciiBytes = 0;

            // Single pass counting: positions of binary nulls, UTF-8 sequences that encode
            // characters from the upper range of Windows-1252, and probable US-ASCII bytes.
            int bytesToSkip = 0;
            for (int i = 0; i < data.Length; i++)
            {
                byte current = data[i];

                // Distribution of binary nulls
                if (current == 0)
                {
                    if (i % 2 == 0)
                    {
                        nullsAtEvenOffsets++;
                    }
                    else
                    {
                        nullsAtOddOffsets++;
                    }
                }

                // Printable ASCII
                if (IsCommonASCII(current))
                {
                    commonAsciiBytes++;
                }

                // Suspicious UTF-8-like sequences; bytes inside an already counted
                // sequence are not examined again.
                if (bytesToSkip > 0)
                {
                    bytesToSkip--;
                    continue;
                }

                int sequenceLength = DetectSuspiciousUTF8SequenceLength(data, i);
                if (sequenceLength > 0)
                {
                    suspiciousSequences++;
                    suspiciousBytes += sequenceLength;
                    bytesToSkip = sequenceLength - 1;
                }
            }

            // UTF-16
            // Little-endian English/European text has many nulls at odd offsets
            // (counting from 0) and few at even offsets; big-endian is the reverse.
            double evenNullRatio = (nullsAtEvenOffsets * 2.0) / data.Length;
            double oddNullRatio = (nullsAtOddOffsets * 2.0) / data.Length;
            if (evenNullRatio < 0.2 && oddNullRatio > 0.6)
            {
                return Encoding.Unicode;
            }

            if (oddNullRatio < 0.2 && evenNullRatio > 0.6)
            {
                return Encoding.BigEndianUnicode;
            }

            // UTF-8
            // The fact that the data CAN be UTF-8 says little about probability: pure 0-127
            // data is identical in most western charsets. So statistics are used as well.
            // The chance of a random byte pair being one of the "suspicious" sequences is
            // 128 / (256 * 256) = 0.2%, and far lower in western text, so more than 1 such
            // sequence per 500,000 bytes (an arbitrary figure) is taken to indicate UTF-8.
            // That only holds if the data really is western text, so the bytes outside the
            // suspicious sequences must be at least 80% plain US-ASCII (also arbitrary).
            int remainingBytes = data.Length - suspiciousBytes;
            bool enoughSuspiciousSequences = suspiciousSequences * 500000.0 / data.Length >= 1;
            bool mostlyAscii =
                remainingBytes == 0 // everything is suspicious: no ASCII ratio can be computed
                || commonAsciiBytes * 1.0 / remainingBytes >= 0.8;

            // Cheap statistical test first; the regex scan only runs when it can matter.
            if (enoughSuspiciousSequences && mostlyAscii && Utf8Pattern.IsMatch(ToByteString(data)))
            {
                return Encoding.UTF8;
            }

            return null;
        }

        /// <summary>
        /// Builds a string in which each char holds exactly one byte value, so the
        /// byte-oriented UTF-8 pattern can be applied. (Decoding as ASCII would turn every
        /// byte >= 0x80 into '?', hiding exactly the bytes the pattern must validate.)
        /// </summary>
        /// <param name="data">Raw bytes to convert.</param>
        private static string ToByteString(byte[] data)
        {
            var chars = new char[data.Length];
            for (int i = 0; i < data.Length; i++)
            {
                chars[i] = (char)data[i];
            }

            return new string(chars);
        }

        /// <summary>Tells whether a byte is tab, line feed, carriage return or printable ASCII (0x20-0x7E).</summary>
        /// <param name="bt">Byte to classify.</param>
        /// <returns><c>true</c> for common text bytes, <c>false</c> otherwise.</returns>
        // ReSharper disable once InconsistentNaming
        private static bool IsCommonASCII(byte bt)
        {
            return bt == 0x09                     // horizontal tab
                || bt == 0x0A                     // line feed
                || bt == 0x0D                     // carriage return
                || (bt >= 0x20 && bt <= 0x7E);    // punctuation, digits, letters
        }

        /// <summary>
        /// Returns the length of a "suspicious" UTF-8 sequence starting at <paramref name="pos"/>:
        /// a 2- or 3-byte sequence from a fixed list (per the scan in DetectUnicode, sequences
        /// from the upper ranges of Windows-1252).
        /// </summary>
        /// <param name="buf">Buffer being scanned.</param>
        /// <param name="pos">Index of the candidate lead byte.</param>
        /// <returns>2 or 3 when a listed sequence starts at <paramref name="pos"/>; otherwise 0.</returns>
        // ReSharper disable once InconsistentNaming
        private static int DetectSuspiciousUTF8SequenceLength(byte[] buf, Int64 pos)
        {
            // Every listed sequence needs at least two bytes.
            if (buf.Length <= pos + 1)
            {
                return 0;
            }

            byte first = buf[pos];
            byte second = buf[pos + 1];

            switch (first)
            {
                case 0xC2:
                    return second == 0x81 || second == 0x8D || second == 0x8F || second == 0x90 || second == 0x9D
                           || (second >= 0xA0 && second <= 0xBF)
                        ? 2
                        : 0;

                case 0xC3:
                    return second >= 0x80 && second <= 0xBF ? 2 : 0;

                case 0xC5:
                    return second == 0x92 || second == 0x93 || second == 0xA0 || second == 0xA1
                           || second == 0xB8 || second == 0xBD || second == 0xBE
                        ? 2
                        : 0;

                case 0xC6:
                    return second == 0x92 ? 2 : 0;

                case 0xCB:
                    return second == 0x86 || second == 0x9C ? 2 : 0;

                case 0xE2:
                {
                    // A third byte must exist before it is read; a buffer ending in
                    // "E2 xx" is simply not a complete 3-byte sequence.
                    if (buf.Length <= pos + 2)
                    {
                        return 0;
                    }

                    byte third = buf[pos + 2];
                    if (second == 0x80)
                    {
                        switch (third)
                        {
                            case 0x93:
                            case 0x94:
                            case 0x98:
                            case 0x99:
                            case 0x9A:
                            case 0x9C:
                            case 0x9D:
                            case 0x9E:
                            case 0xA0:
                            case 0xA1:
                            case 0xA2:
                            case 0xA6:
                            case 0xB0:
                            case 0xB9:
                            case 0xBA:
                                return 3;
                            default:
                                return 0;
                        }
                    }

                    if ((second == 0x82 && third == 0xAC) || (second == 0x84 && third == 0xA2))
                    {
                        return 3;
                    }

                    return 0;
                }

                default:
                    return 0;
            }
        }
    }
}