Delphi.int.ru — Портал программистов

Вход Регистрация | Забыли пароль?

Просмотр кода

Идентификатор: 9b1205f4 Описание: Код загружен: 12 июля 2011, 13:32 (min@y™)

{ Модуль работы с кодировками.
  © min@y™ (minay.tm@gmail.com)}
 
unit uEncoding;
 
interface
 
uses
  //============================== Модули проекта ==============================
  uLogForm,
  //=============================== Левые модули ===============================
  //=================== Системные модули и модули компонентов ==================
  SysUtils, Menus, Windows, Classes, Forms;
 
type
 // Direction of a code-page translation. Each value other than tdNone selects
 // one translation table inside EncodingTranslate; tdNone means "leave the
 // string unchanged".
 TTranslateDirection = (tdNone,
                        tdAnsiToDos,
                        tdAnsiToKOI8,
                        tdAnsiToISO,
                        tdDosToAnsi,
                        tdDosToKOI8,
                        tdDosToISO,
                        tdKOI8ToAnsi,
                        tdKOI8ToDos,
                        tdKOI8ToISO,
                        tdISOToAnsi,
                        tdISOToDos,
                        tdISOToKOI8);
 
 // Translation table: indexed by the source character code ($80..$FF),
 // each element is the replacement character in the target encoding.
 TCodePage = array[$80..$FF] of char;
 
 
 
type
  // Encoding kind
  TEncodingType = (etWIN, etDOS, etISO, etKOI);
 
  // Encoding recognition probabilities, one value per encoding kind
  // (RecognizeEncoding fills it with each encoding's share of the total score).
  TRecognizeProbabilities = array[TEncodingType] of Single;
 
const
  // Short names of the encodings, in TEncodingType order; used by
  // EncodingToStr and StrToEncoding.
  EncodingNames: array[Low(TEncodingType)..High(TEncodingType)] of string = ('WIN', 'DOS', 'ISO', 'KOI');
 
 
{=============================== Функции модуля ===============================}
 
 
function EncodingTranslate(const S: string; // Перевод кодировки строки Ansi ---> Dos, KOI8, ISO
                           const TranslateDirection: TTranslateDirection): string;
function GetTranslateDirection(const AFrom, ATo: TEncodingType): TTranslateDirection; // Determines the translation direction
function EncodingToStr(const AEncoding: TEncodingType): string; // Преобразование типа кодировки в строку
function StrToEncoding(const S: string): TEncodingType; // Преобразование строки в тип кодировки
function RecognizeEncoding(ALines: TStrings;
                           const ALinesLimit: Integer;
                           var RP: TRecognizeProbabilities): TEncodingType; overload; // Распознавание кодировки текста
function RecognizeEncoding(const AFileName: string;
                           const ALinesLimit: Integer;
                           var RP: TRecognizeProbabilities;
                           var AError: Boolean): TEncodingType; overload; // Распознавание кодировки файла
function ConfirmEncoding(const RP: TRecognizeProbabilities; const EdgePercent: Integer): Boolean; // Confirms that the encoding was recognized reliably
 
var
  // Global variables related to encoding handling
  AutoEncodingEnabled:     Boolean = True;  // Flag of the "Auto-detect encoding" option
  AutoEncodingDepth:       Integer = 128;   // Recognition depth
  AutoEncodingEdge:        Integer = 5;     // Non-trigger threshold (for showing the manual encoding selection window)
  AutoEncodingFlashStatus: Boolean = False; // Flash the status bar on change
 
{============================== Секция реализации =============================}
implementation
 
{============================== Константы модуля ==============================}
const
 // One-character constants named C<hex>: Cxy is the character with code $xy.
 // They are the building blocks of the code-page tables declared below.
 // Exception: C00 is defined as #$20 (space), not #$00; the #$00 variant is
 // left commented out on the next line. The Ansi* tables use C00 for
 // characters that have no counterpart in the target encoding.
 //C00 = #$00;
 C00 = #$20; C10 = #$10; C20 = #$20; C30 = #$30; C40 = #$40; C50 = #$50; C60 = #$60; C70 = #$70;
 C01 = #$01; C11 = #$11; C21 = #$21; C31 = #$31; C41 = #$41; C51 = #$51; C61 = #$61; C71 = #$71;
 C02 = #$02; C12 = #$12; C22 = #$22; C32 = #$32; C42 = #$42; C52 = #$52; C62 = #$62; C72 = #$72;
 C03 = #$03; C13 = #$13; C23 = #$23; C33 = #$33; C43 = #$43; C53 = #$53; C63 = #$63; C73 = #$73;
 C04 = #$04; C14 = #$14; C24 = #$24; C34 = #$34; C44 = #$44; C54 = #$54; C64 = #$64; C74 = #$74;
 C05 = #$05; C15 = #$15; C25 = #$25; C35 = #$35; C45 = #$45; C55 = #$55; C65 = #$65; C75 = #$75;
 C06 = #$06; C16 = #$16; C26 = #$26; C36 = #$36; C46 = #$46; C56 = #$56; C66 = #$66; C76 = #$76;
 C07 = #$07; C17 = #$17; C27 = #$27; C37 = #$37; C47 = #$47; C57 = #$57; C67 = #$67; C77 = #$77;
 C08 = #$08; C18 = #$18; C28 = #$28; C38 = #$38; C48 = #$48; C58 = #$58; C68 = #$68; C78 = #$78;
 C09 = #$09; C19 = #$19; C29 = #$29; C39 = #$39; C49 = #$49; C59 = #$59; C69 = #$69; C79 = #$79;
 C0A = #$0A; C1A = #$1A; C2A = #$2A; C3A = #$3A; C4A = #$4A; C5A = #$5A; C6A = #$6A; C7A = #$7A;
 C0B = #$0B; C1B = #$1B; C2B = #$2B; C3B = #$3B; C4B = #$4B; C5B = #$5B; C6B = #$6B; C7B = #$7B;
 C0C = #$0C; C1C = #$1C; C2C = #$2C; C3C = #$3C; C4C = #$4C; C5C = #$5C; C6C = #$6C; C7C = #$7C;
 C0D = #$0D; C1D = #$1D; C2D = #$2D; C3D = #$3D; C4D = #$4D; C5D = #$5D; C6D = #$6D; C7D = #$7D;
 C0E = #$0E; C1E = #$1E; C2E = #$2E; C3E = #$3E; C4E = #$4E; C5E = #$5E; C6E = #$6E; C7E = #$7E;
 C0F = #$0F; C1F = #$1F; C2F = #$2F; C3F = #$3F; C4F = #$4F; C5F = #$5F; C6F = #$6F; C7F = #$7F;
 
 // Character constants for the upper half of the byte range ($80..$FF);
 // Cxy is the character with code $xy.
 C80 = #$80; C90 = #$90; CA0 = #$A0; CB0 = #$B0; CC0 = #$C0; CD0 = #$D0; CE0 = #$E0; CF0 = #$F0;
 C81 = #$81; C91 = #$91; CA1 = #$A1; CB1 = #$B1; CC1 = #$C1; CD1 = #$D1; CE1 = #$E1; CF1 = #$F1;
 C82 = #$82; C92 = #$92; CA2 = #$A2; CB2 = #$B2; CC2 = #$C2; CD2 = #$D2; CE2 = #$E2; CF2 = #$F2;
 C83 = #$83; C93 = #$93; CA3 = #$A3; CB3 = #$B3; CC3 = #$C3; CD3 = #$D3; CE3 = #$E3; CF3 = #$F3;
 C84 = #$84; C94 = #$94; CA4 = #$A4; CB4 = #$B4; CC4 = #$C4; CD4 = #$D4; CE4 = #$E4; CF4 = #$F4;
 C85 = #$85; C95 = #$95; CA5 = #$A5; CB5 = #$B5; CC5 = #$C5; CD5 = #$D5; CE5 = #$E5; CF5 = #$F5;
 C86 = #$86; C96 = #$96; CA6 = #$A6; CB6 = #$B6; CC6 = #$C6; CD6 = #$D6; CE6 = #$E6; CF6 = #$F6;
 C87 = #$87; C97 = #$97; CA7 = #$A7; CB7 = #$B7; CC7 = #$C7; CD7 = #$D7; CE7 = #$E7; CF7 = #$F7;
 C88 = #$88; C98 = #$98; CA8 = #$A8; CB8 = #$B8; CC8 = #$C8; CD8 = #$D8; CE8 = #$E8; CF8 = #$F8;
 C89 = #$89; C99 = #$99; CA9 = #$A9; CB9 = #$B9; CC9 = #$C9; CD9 = #$D9; CE9 = #$E9; CF9 = #$F9;
 C8A = #$8A; C9A = #$9A; CAA = #$AA; CBA = #$BA; CCA = #$CA; CDA = #$DA; CEA = #$EA; CFA = #$FA;
 C8B = #$8B; C9B = #$9B; CAB = #$AB; CBB = #$BB; CCB = #$CB; CDB = #$DB; CEB = #$EB; CFB = #$FB;
 C8C = #$8C; C9C = #$9C; CAC = #$AC; CBC = #$BC; CCC = #$CC; CDC = #$DC; CEC = #$EC; CFC = #$FC;
 C8D = #$8D; C9D = #$9D; CAD = #$AD; CBD = #$BD; CCD = #$CD; CDD = #$DD; CED = #$ED; CFD = #$FD;
 C8E = #$8E; C9E = #$9E; CAE = #$AE; CBE = #$BE; CCE = #$CE; CDE = #$DE; CEE = #$EE; CFE = #$FE;
 C8F = #$8F; C9F = #$9F; CAF = #$AF; CBF = #$BF; CCF = #$CF; CDF = #$DF; CEF = #$EF; CFF = #$FF;
 
{***********************************************************************************************}
{*********************************** Таблицы кодировок *****************************************}
{***********************************************************************************************}
 
{---------------------------------------- Windows ----------------------------------------------}
 
 // Translation table used for tdAnsiToDos (Windows-1251 -> CP866).
 // Every entry is written as a Cxy byte constant instead of a character
 // literal, so the table no longer depends on the encoding the source file is
 // saved in. This also repairs index $C8 ("И"), whose literal had degraded to
 // '?' and must be $88. Characters without a CP866 counterpart become C00
 // (space).
 //    0    1    2    3    4    5    6    7    8    9    A    B    C    D    E    F
 AnsiToDosCodePage: TCodePage=(
      C00, C00, C00, C00, C00, C00, C00, C00, C00, C00, C00, C00, C00, C00, C00, C00,  // 80-8F
      C00, C00, C00, C00, C00, CF9, C00, C00, C00, C00, C00, C00, C00, C00, C00, C00,  // 90-9F
      C00, CF6, CF7, C00, CFD, C00, C00, C00, CF0, C00, CF2, C00, C00, C00, C00, CF4,  // A0-AF
      CF8, C00, C00, C00, C00, C00, C00, CFA, CF1, CFC, CF3, C00, C00, C00, C00, CF5,  // B0-BF
      C80, C81, C82, C83, C84, C85, C86, C87, C88, C89, C8A, C8B, C8C, C8D, C8E, C8F,  // C0-CF
      C90, C91, C92, C93, C94, C95, C96, C97, C98, C99, C9A, C9B, C9C, C9D, C9E, C9F,  // D0-DF
      CA0, CA1, CA2, CA3, CA4, CA5, CA6, CA7, CA8, CA9, CAA, CAB, CAC, CAD, CAE, CAF,  // E0-EF
      CE0, CE1, CE2, CE3, CE4, CE5, CE6, CE7, CE8, CE9, CEA, CEB, CEC, CED, CEE, CEF); // F0-FF
 
 // Translation table used for tdAnsiToISO (Windows-1251 -> ISO-8859-5).
 // Rows 80-BF are the exact inverse of IsoToAnsiCodePage, so a character that
 // exists in both encodings survives a round trip. In particular "Ё" ($A8)
 // now maps to $A1 and "№" ($B9) to $F0; previously "Ё" was written as $F0,
 // which is "№" in ISO-8859-5. Characters without an ISO-8859-5 counterpart
 // become C00 (space).
 //    0    1    2    3    4    5    6    7    8    9    A    B    C    D    E    F
 AnsiToIsoCodePage: TCodePage=(
      CA2, CA3, C00, CF3, C00, C00, C00, C00, C00, C00, CA9, C00, CAA, CAC, CAB, CAF,  // 80-8F
      CF2, C00, C00, C00, C00, C00, C00, C00, C00, C00, CF9, C00, CFA, CFC, CFB, CFF,  // 90-9F
      CA0, CAE, CFE, CA8, C00, C00, C00, CFD, CA1, C00, CA4, C00, C00, CAD, C00, CA7,  // A0-AF
      C00, C00, CA6, CF6, C00, C00, C00, C00, CF1, CF0, CF4, C00, CF8, CA5, CF5, CF7,  // B0-BF
      CB0, CB1, CB2, CB3, CB4, CB5, CB6, CB7, CB8, CB9, CBA, CBB, CBC, CBD, CBE, CBF,  // C0-CF
      CC0, CC1, CC2, CC3, CC4, CC5, CC6, CC7, CC8, CC9, CCA, CCB, CCC, CCD, CCE, CCF,  // D0-DF
      CD0, CD1, CD2, CD3, CD4, CD5, CD6, CD7, CD8, CD9, CDA, CDB, CDC, CDD, CDE, CDF,  // E0-EF
      CE0, CE1, CE2, CE3, CE4, CE5, CE6, CE7, CE8, CE9, CEA, CEB, CEC, CED, CEE, CEF); // F0-FF
 
 
 // Translation table used for tdAnsiToKOI8 (Windows-1251 -> KOI8).
 // "Ё" ($A8) and "ё" ($B8) map to the KOI8 codes $B3 / $A3, matching
 // DosToKoi8CodePage, IsoToKoi8CodePage and the reverse Koi8ToAnsiCodePage
 // (previously they were degraded to "Е" / "е").
 // Characters without a mapping become C00 (space).
 // NOTE(review): $AB / $BB (the guillemets in Windows-1251) map to $AE / $AF,
 // which the Dos<->KOI8 tables treat as box-drawing characters - confirm
 // whether that is intended.
 //    0    1    2    3    4    5    6    7    8    9    A    B    C    D    E    F
 AnsiToKoi8CodePage: TCodePage=(
      C00, C00, C00, C00, C00, C00, C00, C00, C00, C00, C00, C00, C00, C00, C00, C00,  // 80-8F
      C00, C00, C00, C00, C00, C00, C00, C00, C00, C00, C00, C00, C00, C00, C00, C00,  // 90-9F
      C00, C00, C00, C00, C00, C00, C00, C00, CB3, C00, C00, CAE, C00, C00, C00, C00,  // A0-AF
      C00, C00, C00, C00, C00, C00, C00, C00, CA3, C00, C00, CAF, C00, C00, C00, C00,  // B0-BF
      CE1, CE2, CF7, CE7, CE4, CE5, CF6, CFA, CE9, CEA, CEB, CEC, CED, CEE, CEF, CF0,  // C0-CF
      CF2, CF3, CF4, CF5, CE6, CE8, CE3, CFE, CFB, CFD, CFF, CF9, CF8, CFC, CE0, CF1,  // D0-DF
      CC1, CC2, CD7, CC7, CC4, CC5, CD6, CDA, CC9, CCA, CCB, CCC, CCD, CCE, CCF, CD0,  // E0-EF
      CD2, CD3, CD4, CD5, CC6, CC8, CC3, CDE, CDB, CDD, CDF, CD9, CD8, CDC, CC0, CD1); // F0-FF
 
{------------------------------------------- Dos -----------------------------------------------}
 
 // Translation table used for tdDosToAnsi (presumably CP866 -> Windows-1251).
 // Rows 80-AF and E0-EF carry the letters; rows B0-DF replace the source
 // characters with ASCII look-alikes such as '-', '+', 'L', 'T', '='.
 //    0    1    2    3    4    5    6    7    8    9    A    B    C    D    E    F
 DosToAnsiCodePage: TCodePage=(
      CC0, CC1, CC2, CC3, CC4, CC5, CC6, CC7, CC8, CC9, CCA, CCB, CCC, CCD, CCE, CCF,  // 80-8F
      CD0, CD1, CD2, CD3, CD4, CD5, CD6, CD7, CD8, CD9, CDA, CDB, CDC, CDD, CDE, CDF,  // 90-9F
      CE0, CE1, CE2, CE3, CE4, CE5, CE6, CE7, CE8, CE9, CEA, CEB, CEC, CED, CEE, CEF,  // A0-AF
      C2D, C2D, C2D, CA6, C2B, CA6, CA6, CAC, CAC, CA6, CA6, CAC, C2D, C2D, C2D, CAC,  // B0-BF
      C4C, C2B, C54, C2B, C2D, C2B, CA6, CA6, C4C, CE3, CA6, C54, CA6, C3D, C2B, CA6,  // C0-CF
      CA6, C54, C54, C4C, C4C, C2D, CE3, C2B, C2B, C2D, C2D, C2D, C2D, CA6, CA6, C2D,  // D0-DF
      CF0, CF1, CF2, CF3, CF4, CF5, CF6, CF7, CF8, CF9, CFA, CFB, CFC, CFD, CFE, CFF,  // E0-EF
      CA8, CB8, CAA, CBA, CAF, CBF, CA1, CA2, CB0, C95, CB7, C76, CB9, CA4, CA6, CA0); // F0-FF
 
 // Translation table used for tdDosToISO (CP866 -> ISO-8859-5).
 // Characters without an ISO-8859-5 counterpart become C3F ('?').
 // Index $F7 ("ў") maps to $FE, the lowercase letter; it used to repeat $AE,
 // the uppercase "Ў" already produced by index $F6. IsoToDosCodePage confirms
 // the pairing ($AE -> $F6, $FE -> $F7).
 //    0    1    2    3    4    5    6    7    8    9    A    B    C    D    E    F
 DosToIsoCodePage: TCodePage=(
      CB0, CB1, CB2, CB3, CB4, CB5, CB6, CB7, CB8, CB9, CBA, CBB, CBC, CBD, CBE, CBF,  // 80-8F
      CC0, CC1, CC2, CC3, CC4, CC5, CC6, CC7, CC8, CC9, CCA, CCB, CCC, CCD, CCE, CCF,  // 90-9F
      CD0, CD1, CD2, CD3, CD4, CD5, CD6, CD7, CD8, CD9, CDA, CDB, CDC, CDD, CDE, CDF,  // A0-AF
      C3F, C3F, C3F, C3F, C3F, C3F, C3F, C3F, C3F, C3F, C3F, C3F, C3F, C3F, C3F, C3F,  // B0-BF
      C3F, C3F, C3F, C3F, C3F, C3F, C3F, C3F, C3F, C3F, C3F, C3F, C3F, C3F, C3F, C3F,  // C0-CF
      C3F, C3F, C3F, C3F, C3F, C3F, C3F, C3F, C3F, C3F, C3F, C3F, C3F, C3F, C3F, C3F,  // D0-DF
      CE0, CE1, CE2, CE3, CE4, CE5, CE6, CE7, CE8, CE9, CEA, CEB, CEC, CED, CEE, CEF,  // E0-EF
      CA1, CF1, CA4, CF4, CA7, CF7, CAE, CFE, C3F, C3F, C2E, C3F, CF0, C24, C3F, CA0); // F0-FF
 
 // Translation table used for tdDosToKOI8 (CP866 -> KOI8).
 // Characters without a KOI8 counterpart become C3F ('?').
 // Index $8E ("О") maps to $EF; it used to repeat $EE, the code of "Н"
 // (index $8D), so every capital "О" was turned into "Н". Row C0-CF of
 // AnsiToKoi8CodePage shows the intended sequence ... CEE, CEF, CF0.
 //    0    1    2    3    4    5    6    7    8    9    A    B    C    D    E    F
 DosToKoi8CodePage: TCodePage=(
      CE1, CE2, CF7, CE7, CE4, CE5, CF6, CFA, CE9, CEA, CEB, CEC, CED, CEE, CEF, CF0,  // 80-8F
      CF2, CF3, CF4, CF5, CE6, CE8, CE3, CFE, CFB, CFD, CFF, CF9, CF8, CFC, CE0, CF1,  // 90-9F
      CC1, CC2, CD7, CC7, CC4, CC5, CD6, CDA, CC9, CCA, CCB, CCC, CCD, CCE, CCF, CD0,  // A0-AF
      C90, C91, C92, C81, C87, CB2, CB4, CA7, CA6, CB5, CA1, CA8, CAE, CAD, CAC, C83,  // B0-BF
      C84, C89, C88, C86, C80, C8A, CAF, CB0, CAB, CA5, CBB, CB8, CB1, CA0, CBE, CB9,  // C0-CF
      CBA, CB6, CB7, CAA, CA9, CA2, CA4, CBD, CBC, C85, C82, C8D, C8C, C8E, C8F, C8B,  // D0-DF
      CD2, CD3, CD4, CD5, CC6, CC8, CC3, CDE, CDB, CDD, CDF, CD9, CD8, CDC, CC0, CD1,  // E0-EF
      CB3, CA3, C3F, C3F, C3F, C3F, C3F, C3F, C9C, C95, C9E, C96, C3F, C3F, C94, C9A); // F0-FF
 
{------------------------------------------ Iso ------------------------------------------------}
 
 // Translation table used for tdISOToAnsi (presumably ISO-8859-5 -> Windows-1251).
 // Rows 80-9F map to C3F ('?') except index $98, which maps to $98.
 //    0    1    2    3    4    5    6    7    8    9    A    B    C    D    E    F
 IsoToAnsiCodePage: TCodePage=(
      C3F, C3F, C3F, C3F, C3F, C3F, C3F, C3F, C3F, C3F, C3F, C3F, C3F, C3F, C3F, C3F,  // 80-8F
      C3F, C3F, C3F, C3F, C3F, C3F, C3F, C3F, C98, C3F, C3F, C3F, C3F, C3F, C3F, C3F,  // 90-9F
      CA0, CA8, C80, C81, CAA, CBD, CB2, CAF, CA3, C8A, C8C, C8E, C8D, CAD, CA1, C8F,  // A0-AF
      CC0, CC1, CC2, CC3, CC4, CC5, CC6, CC7, CC8, CC9, CCA, CCB, CCC, CCD, CCE, CCF,  // B0-BF
      CD0, CD1, CD2, CD3, CD4, CD5, CD6, CD7, CD8, CD9, CDA, CDB, CDC, CDD, CDE, CDF,  // C0-CF
      CE0, CE1, CE2, CE3, CE4, CE5, CE6, CE7, CE8, CE9, CEA, CEB, CEC, CED, CEE, CEF,  // D0-DF
      CF0, CF1, CF2, CF3, CF4, CF5, CF6, CF7, CF8, CF9, CFA, CFB, CFC, CFD, CFE, CFF,  // E0-EF
      CB9, CB8, C90, C83, CBA, CBE, CB3, CBF, CBC, C9A, C9C, C9E, C9D, CA7, CA2, C9F); // F0-FF
 
 // Translation table used for tdISOToDos (presumably ISO-8859-5 -> CP866).
 // Entries equal to C3F ('?') mark characters the table does not translate.
 //    0    1    2    3    4    5    6    7    8    9    A    B    C    D    E    F
 IsoToDosCodePage: TCodePage=(
      C3F, C3F, C3F, C3F, C3F, C3F, C3F, C3F, C3F, C3F, C3F, C3F, C3F, C3F, C3F, C3F,  // 80-8F
      C3F, C3F, C3F, C3F, C3F, C3F, C3F, C3F, C3F, C3F, C3F, C3F, C3F, C3F, C3F, C3F,  // 90-9F
      CFF, CF0, C3F, C3F, CF2, C3F, C3F, CF4, C3F, C3F, C3F, C3F, C3F, C2D, CF6, C3F,  // A0-AF
      C80, C81, C82, C83, C84, C85, C86, C87, C88, C89, C8A, C8B, C8C, C8D, C8E, C8F,  // B0-BF
      C90, C91, C92, C93, C94, C95, C96, C97, C98, C99, C9A, C9B, C9C, C9D, C9E, C9F,  // C0-CF
      CA0, CA1, CA2, CA3, CA4, CA5, CA6, CA7, CA8, CA9, CAA, CAB, CAC, CAD, CAE, CAF,  // D0-DF
      CE0, CE1, CE2, CE3, CE4, CE5, CE6, CE7, CE8, CE9, CEA, CEB, CEC, CED, CEE, CEF,  // E0-EF
      CFC, CF1, C3F, C3F, CF3, C3F, C3F, CF5, C3F, C3F, C3F, C3F, C3F, C15, CF7, C3F); // F0-FF
 
 // Translation table used for tdISOToKOI8 (ISO-8859-5 -> KOI8).
 // Characters without a KOI8 counterpart become C3F ('?').
 // Index $BE ("О") maps to $EF; it used to repeat $EE, the code of "Н"
 // (index $BD), so every capital "О" was turned into "Н". Row C0-CF of
 // AnsiToKoi8CodePage shows the intended sequence ... CEE, CEF, CF0.
 //    0    1    2    3    4    5    6    7    8    9    A    B    C    D    E    F
 IsoToKoi8CodePage: TCodePage=(
      C3F, C3F, C3F, C3F, C3F, C3F, C3F, C3F, C3F, C3F, C3F, C3F, C3F, C3F, C3F, C3F,  // 80-8F
      C3F, C3F, C3F, C3F, C3F, C3F, C3F, C3F, C3F, C3F, C3F, C3F, C3F, C3F, C3F, C3F,  // 90-9F
      C9A, CB3, C3F, C3F, C3F, C3F, C3F, C3F, C3F, C3F, C3F, C3F, C3F, C2D, C3F, C3F,  // A0-AF
      CE1, CE2, CF7, CE7, CE4, CE5, CF6, CFA, CE9, CEA, CEB, CEC, CED, CEE, CEF, CF0,  // B0-BF
      CF2, CF3, CF4, CF5, CE6, CE8, CE3, CFE, CFB, CFD, CFF, CF9, CF8, CFC, CE0, CF1,  // C0-CF
      CC1, CC2, CD7, CC7, CC4, CC5, CD6, CDA, CC9, CCA, CCB, CCC, CCD, CCE, CCF, CD0,  // D0-DF
      CD2, CD3, CD4, CD5, CC6, CC8, CC3, CDE, CDB, CDD, CDF, CD9, CD8, CDC, CC0, CD1,  // E0-EF
      C3F, CA3, C3F, C3F, C3F, C3F, C3F, C3F, C3F, C3F, C3F, C3F, C3F, C15, C3F, C3F); // F0-FF
 
{------------------------------------------ Koi8 -----------------------------------------------}
 
 // Translation table used for tdKOI8ToAnsi (presumably KOI8-R -> Windows-1251).
 // Rows C0-FF carry the letters; rows 80-BF mostly map to ASCII look-alikes
 // ('-', '+', 'L', 'T', '=') or to C3F ('?').
 //    0    1    2    3    4    5    6    7    8    9    A    B    C    D    E    F
 Koi8ToAnsiCodePage: TCodePage=(
      C2D, CA6, C2D, CAC, C4C, C2D, C2B, C2B, C54, C2B, C2B, C2D, C2D, C2D, CA6, CA6,  // 80-8F
      C2D, C2D, C2D, C3F, CA6, C95, C76, C3F, C3F, C3F, CA0, C3F, CB0, C3F, CB7, C3F,  // 90-9F
      C3D, CA6, C2D, CB8, CE3, CE3, CAC, CAC, CAC, C4C, C4C, C4C, C2D, C2D, C2D, CA6,  // A0-AF
      CA6, CA6, CA6, CA8, CA6, CA6, C54, C54, C54, CA6, CA6, CA6, C2B, C2B, C2B, CA9,  // B0-BF
      CFE, CE0, CE1, CF6, CE4, CE5, CF4, CE3, CF5, CE8, CE9, CEA, CEB, CEC, CED, CEE,  // C0-CF
      CEF, CFF, CF0, CF1, CF2, CF3, CE6, CE2, CFC, CFB, CE7, CF8, CFD, CF9, CF7, CFA,  // D0-DF
      CDE, CC0, CC1, CD6, CC4, CC5, CD4, CC3, CD5, CC8, CC9, CCA, CCB, CCC, CCD, CCE,  // E0-EF
      CCF, CDF, CD0, CD1, CD2, CD3, CC6, CC2, CDC, CDB, CC7, CD8, CDD, CD9, CD7, CDA); // F0-FF
 
 // Translation table used for tdKOI8ToDos (KOI8-R -> CP866).
 // The table is the inverse of DosToKoi8CodePage: for every pair
 // "DOS code d -> KOI8 code k" defined there, this table maps k back to d.
 // (The previous contents were a copy of the Dos -> KOI8 table, so e.g. the
 // KOI8 letter "А" ($E1) was translated to $D3 instead of $80.)
 // The letter rows C0-FF agree with Koi8ToAnsiCodePage shifted into the CP866
 // letter ranges. KOI8 characters with no CP866 counterpart become C3F ('?').
 //    0    1    2    3    4    5    6    7    8    9    A    B    C    D    E    F
 Koi8ToDosCodePage: TCodePage=(
      CC4, CB3, CDA, CBF, CC0, CD9, CC3, CB4, CC2, CC1, CC5, CDF, CDC, CDB, CDD, CDE,  // 80-8F
      CB0, CB1, CB2, C3F, CFE, CF9, CFB, C3F, C3F, C3F, CFF, C3F, CF8, C3F, CFA, C3F,  // 90-9F
      CCD, CBA, CD5, CF1, CD6, CC9, CB8, CB7, CBB, CD4, CD3, CC8, CBE, CBD, CBC, CC6,  // A0-AF
      CC7, CCC, CB5, CF0, CB6, CB9, CD1, CD2, CCB, CCF, CD0, CCA, CD8, CD7, CCE, C3F,  // B0-BF
      CEE, CA0, CA1, CE6, CA4, CA5, CE4, CA3, CE5, CA8, CA9, CAA, CAB, CAC, CAD, CAE,  // C0-CF
      CAF, CEF, CE0, CE1, CE2, CE3, CA6, CA2, CEC, CEB, CA7, CE8, CED, CE9, CE7, CEA,  // D0-DF
      C9E, C80, C81, C96, C84, C85, C94, C83, C95, C88, C89, C8A, C8B, C8C, C8D, C8E,  // E0-EF
      C8F, C9F, C90, C91, C92, C93, C86, C82, C9C, C9B, C87, C98, C9D, C99, C97, C9A); // F0-FF
 
 // Translation table used for tdKOI8ToISO (presumably KOI8-R -> ISO-8859-5).
 // Rows C0-FF carry the letters; in rows 80-BF almost everything maps to
 // C3F ('?'), with a few ASCII substitutes ('2', '.', 'C').
 //    0    1    2    3    4    5    6    7    8    9    A    B    C    D    E    F
 Koi8ToIsoCodePage: TCodePage=(
      C3F, C3F, C3F, C3F, C3F, C3F, C3F, C3F, C3F, C3F, C3F, C3F, C3F, C3F, C3F, C3F,  // 80-8F
      C3F, C3F, C3F, C3F, C3F, C3F, C3F, C3F, C3F, C3F, CA0, C3F, C3F, C32, C2E, C3F,  // 90-9F
      C3F, C3F, C3F, CF1, C3F, C3F, C3F, C3F, C3F, C3F, C3F, C3F, C3F, C3F, C3F, C3F,  // A0-AF
      C3F, C3F, C3F, CA1, C3F, C3F, C3F, C3F, C3F, C3F, C3F, C3F, C3F, C3F, C3F, C43,  // B0-BF
      CEE, CD0, CD1, CE6, CD4, CD5, CE4, CD3, CE5, CD8, CD9, CDA, CDB, CDC, CDD, CDE,  // C0-CF
      CDF, CEF, CE0, CE1, CE2, CE3, CD6, CD2, CEC, CEB, CD7, CE8, CED, CE9, CE7, CEA,  // D0-DF
      CCE, CB0, CB1, CC6, CB4, CB5, CC4, CB3, CC5, CB8, CB9, CBA, CBB, CBC, CBD, CBE,  // E0-EF
      CBF, CCF, CC0, CC1, CC2, CC3, CB6, CB2, CCC, CCB, CB7, CC8, CCD, CC9, CC7, CCA); // F0-FF
 
// Translates a string between the Ansi, Dos, KOI8 and ISO encodings.
//   S                  - source string; it is not modified.
//   TranslateDirection - selects the translation table; tdNone (or any value
//                        without a table) returns S unchanged.
// Returns a copy of S in which every character with a code in $80..$FF is
// replaced by the corresponding table entry. Characters below $80 and, under
// a compiler where Char is wider than one byte, characters above $FF are
// copied as they are.
function EncodingTranslate(const S: string;
                           const TranslateDirection: TTranslateDirection): string;
var
 i:        Integer;
 Code:     Integer; // Integer, not Byte: Ord() of a wide Char may exceed $FF
 CodePage: TCodePage;
begin
  Result:= S;

  case TranslateDirection of
    tdAnsiToDos  : CodePage:= AnsiToDosCodePage;
    tdAnsiToKOI8 : CodePage:= AnsiToKOI8CodePage;
    tdAnsiToISO  : CodePage:= AnsiToISOCodePage;
    tdDosToAnsi  : CodePage:= DosToAnsiCodePage;
    tdDosToKOI8  : CodePage:= DosToKOI8CodePage;
    tdDosToISO   : CodePage:= DosToISOCodePage;
    tdKOI8ToAnsi : CodePage:= KOI8ToAnsiCodePage;
    tdKOI8ToDos  : CodePage:= KOI8ToDosCodePage;
    tdKOI8ToISO  : CodePage:= KOI8ToISOCodePage;
    tdISOToAnsi  : CodePage:= ISOToAnsiCodePage;
    tdISOToDos   : CodePage:= ISOToDosCodePage;
    tdISOToKOI8  : CodePage:= ISOToKOI8CodePage;
  else
    // tdNone or an unexpected value: there is no table, so never fall through
    // to the loop below with an uninitialised CodePage.
    Exit;
  end; //case

  for i:= 1 to Length(Result) do
    begin
      Code:= Ord(Result[i]);
      // Only codes covered by the table are translated.
      if (Code >= Low(CodePage)) and (Code <= High(CodePage))
        then Result[i]:= CodePage[Code];
    end;
end;
 
// Returns the TTranslateDirection associated with a pair of encodings.
// Equal encodings give tdNone.
// NOTE(review): every entry names the direction "ATo -> AFrom" (for example
// AFrom = etWIN, ATo = etDOS yields tdDosToAnsi), which is the reverse of what
// the parameter names suggest. The callers are not visible here, so the
// mapping is kept exactly as it was - confirm against the call sites before
// changing it.
function GetTranslateDirection(const AFrom, ATo: TEncodingType): TTranslateDirection;
const
  // First index: AFrom, second index: ATo (order etWIN, etDOS, etISO, etKOI).
  Directions: array[TEncodingType, TEncodingType] of TTranslateDirection = (
    {AFrom = etWIN} (tdNone,       tdDosToAnsi, tdISOToAnsi, tdKOI8ToAnsi),
    {AFrom = etDOS} (tdAnsiToDos,  tdNone,      tdISOToDos,  tdKOI8ToDos),
    {AFrom = etISO} (tdAnsiToISO,  tdDosToISO,  tdNone,      tdKOI8ToISO),
    {AFrom = etKOI} (tdAnsiToKOI8, tdDosToKOI8, tdISOToKOI8, tdNone));
begin
  Result := Directions[AFrom, ATo];
end;
 
// Returns the short name of an encoding ('WIN', 'DOS', 'ISO' or 'KOI'),
// taken from the EncodingNames table.
function EncodingToStr(const AEncoding: TEncodingType): string;
begin
  Result := EncodingNames[AEncoding];
end;
 
// Converts an encoding name back to its TEncodingType value.
//   S - one of the names in EncodingNames. Surrounding whitespace is ignored
//       and the comparison is case-insensitive, so 'dos' and ' DOS ' are
//       accepted as well as 'DOS'.
// Returns the matching encoding, or Low(TEncodingType) (Windows) when S does
// not name any known encoding.
function StrToEncoding(const S: string): TEncodingType;
var
  eIndex: TEncodingType;
  Key: string;
begin
  Key:= Trim(S);

  for eIndex:= Low(TEncodingType) to High(TEncodingType) do
    if SameText(Key, EncodingNames[eIndex])
      then begin
             Result:= eIndex;
             Exit;
           end;

  Result:= Low(TEncodingType); // Default: Windows.
end;
 
const
  // Cyrillic letters (the character range 'А'..'я').
  // NOTE(review): this constant is not referenced anywhere in the visible
  // part of this unit.
  RussianLetters: set of Char = ['А'..'я'];
 
  // Occurrence probabilities of Russian letters in text, indexed by letter.
  // According to the original author they were obtained empirically: a small
  // console counting program was fed a couple of dozen megabytes of text
  // files (stories, jokes and the like).
  // The upper-case and lower-case halves hold the same values; the trailing
  // comment on each row lists the letters that row covers.
  Probabilities: array['А'..'я'] of Single = (
       0.057, 0.010, 0.031, 0.011, 0.021, 0.067, 0.007, 0.013,   // АБВГДЕЖЗ
       0.052, 0.011, 0.023, 0.030, 0.024, 0.043, 0.075, 0.026,   // ИЙКЛМНОП
       0.038, 0.034, 0.046, 0.016, 0.001, 0.006, 0.002, 0.011,   // РСТУФХЦЧ
       0.004, 0.004, 0.000, 0.012, 0.012, 0.003, 0.005, 0.015,   // ШЩЪЫЬЭЮЯ
       0.057, 0.010, 0.031, 0.011, 0.021, 0.067, 0.007, 0.013,   // абвгдежз
       0.052, 0.011, 0.023, 0.030, 0.024, 0.043, 0.075, 0.026,   // ийклмноп
       0.038, 0.034, 0.046, 0.016, 0.001, 0.006, 0.002, 0.011,   // рстуфхцч
       0.004, 0.004, 0.000, 0.012, 0.012, 0.003, 0.005, 0.015);  // шщъыьэюя
 
 
// Guesses the encoding of a text from letter frequencies.
//   ALines      - the text to examine.
//   ALinesLimit - number of leading lines to examine; a value of zero or less
//                 means all lines are examined.
//   RP          - receives, for every encoding, its share of the total score
//                 (all zeros when no Cyrillic letter was found).
// For each candidate encoding the lines are translated to Ansi as if they were
// written in that encoding, and the Probabilities of all characters that land
// in 'А'..'я' are summed. The encoding with the largest sum is returned;
// Low(TEncodingType) is returned when every sum is zero.
function RecognizeEncoding(ALines: TStrings;
                           const ALinesLimit: Integer;
                           var RP: TRecognizeProbabilities): TEncodingType;
var
  LineNo, LastLine, CharNo: Integer;
  Enc: TEncodingType;                      // candidate encoding
  Scores: array[TEncodingType] of Single;  // accumulated score per candidate
  Candidate: string;                       // current line, translated to Ansi
  Ch: Char;
  BestScore: Single;                       // largest score seen so far
  ScoreSum: Single;                        // sum of all scores, for the shares
begin
  FillChar(Scores, SizeOf(Scores), 0);
  FillChar(RP, SizeOf(RP), 0);
  ScoreSum := 0.0;

  for Enc := Low(TEncodingType) to High(TEncodingType) do
  begin
    // Index of the last line to examine: the whole list, cut down to
    // ALinesLimit lines when a positive limit is given.
    LastLine := ALines.Count - 1;
    if (ALinesLimit > 0) and (ALinesLimit - 1 < LastLine) then
      LastLine := ALinesLimit - 1;

    for LineNo := 0 to LastLine do
    begin
      // Interpret the line as text in the candidate encoding.
      if Enc = etDOS then
        Candidate := EncodingTranslate(ALines[LineNo], tdDosToAnsi)
      else if Enc = etISO then
        Candidate := EncodingTranslate(ALines[LineNo], tdISOToAnsi)
      else if Enc = etKOI then
        Candidate := EncodingTranslate(ALines[LineNo], tdKOI8ToAnsi)
      else
        Candidate := ALines[LineNo];

      // Add up the probabilities of the Cyrillic letters in the line.
      for CharNo := 1 to Length(Candidate) do
      begin
        Ch := Candidate[CharNo];
        if not ((Ch < 'А') or (Ch > 'я')) then
          Scores[Enc] := Scores[Enc] + Probabilities[Ch];
      end;
    end;

    ScoreSum := ScoreSum + Scores[Enc];
  end;

  // Pick the winner and convert the scores into shares of the total.
  Result := Low(TEncodingType);
  BestScore := 0.000;

  for Enc := Low(TEncodingType) to High(TEncodingType) do
  begin
    if Scores[Enc] > BestScore then
    begin
      BestScore := Scores[Enc];
      Result := Enc;
    end;

    if ScoreSum > 0.0 then
      RP[Enc] := Scores[Enc] / ScoreSum
    else
      RP[Enc] := 0.0;
  end;
end;
 
// Guesses the encoding of a text file.
//   AFileName   - file to load and examine.
//   ALinesLimit - number of leading lines to examine (passed on to the
//                 TStrings overload).
//   RP          - receives the per-encoding shares. It is zeroed first, so it
//                 holds defined values even when the file cannot be read.
//   AError      - set to True when loading or recognition raised an
//                 exception, False otherwise.
// Returns the recognized encoding, or etWIN on failure. On failure an error
// message box is shown; the exception is not re-raised.
function RecognizeEncoding(const AFileName: string;
                           const ALinesLimit: Integer;
                           var RP: TRecognizeProbabilities;
                           var AError: Boolean): TEncodingType;
var
  List: TStringList;
begin
  Result:= etWIN;
  AError:= False;
  // Same initialisation as in the TStrings overload: never hand back
  // whatever the caller happened to pass in.
  FillChar(RP, SizeOf(RP), 0);

  List:= TStringList.Create();
  try
    try
      List.LoadFromFile(AFileName);
      Result:= RecognizeEncoding(List, ALinesLimit, RP);
    except on E: Exception do
             begin
               AError:= True;
               Application.MessageBox(PChar('Не могу открыть файл "' + AFileName + '".'#13#10 + E.Message),
                                      PChar('Ошибка'),
                                      MB_OK + MB_ICONERROR);
             end;
    end;
  finally
    List.Free();
  end;
end;
 
// Tells whether a recognition result is reliable enough to be used without
// asking the user.
//   RP          - per-encoding shares produced by RecognizeEncoding.
//   EdgePercent - required lead of the best share over the second best one,
//                 in percent.
// Returns True when the largest share exceeds the second largest by at least
// EdgePercent / 100, or when the largest share is below 0.01 (all shares are
// practically zero: the text has no Russian letters, so it is treated as
// Windows).
function ConfirmEncoding(const RP: TRecognizeProbabilities; const EdgePercent: Integer): Boolean;
var
  Best, RunnerUp, Current: Single;
  Enc: TEncodingType;
begin
  Best := 0.0;
  RunnerUp := 0.0;

  // Find the two largest shares in a single pass.
  for Enc := Low(TEncodingType) to High(TEncodingType) do
  begin
    Current := RP[Enc];
    if Current > Best then
    begin
      RunnerUp := Best;
      Best := Current;
    end
    else if Current > RunnerUp then
      RunnerUp := Current;
  end;

  if Best < 0.01 then
    Result := True // nothing recognizable at all
  else
    Result := Best - RunnerUp >= EdgePercent / 100; // clear lead of the winner
end;
 
end.

Ссылка на данный код:

На главную страницу сервиса обмена кодом »