Просмотр кода
Идентификатор: 9b1205f4 Описание: Код загружен: 12 июля 2011, 13:32 (min@y™)
{ Encoding conversion module. © min@y™ (minay.tm@gmail.com) }
unit uEncoding;

interface

uses
  //============================== Project units ===============================
  uLogForm,
  //============================ Third-party units =============================
  //===================== System and component units ===========================
  SysUtils, Menus, Windows, Classes, Forms;

type
  // Direction of a single-byte code page translation.
  TTranslateDirection = (tdNone,
    tdAnsiToDos, tdAnsiToKOI8, tdAnsiToISO,
    tdDosToAnsi, tdDosToKOI8, tdDosToISO,
    tdKOI8ToAnsi, tdKOI8ToDos, tdKOI8ToISO,
    tdISOToAnsi, tdISOToDos, tdISOToKOI8);

  // Translation table for the upper half ($80..$FF) of a single-byte code page.
  TCodePage = array[$80..$FF] of char;

type
  // Encoding type
  TEncodingType = (etWIN, etDOS, etISO, etKOI);

  // Encoding recognition probabilities (one share per encoding)
  TRecognizeProbabilities = array[TEncodingType] of Single;

const
  EncodingNames: array[Low(TEncodingType)..High(TEncodingType)] of string = ('WIN', 'DOS', 'ISO', 'KOI');

{=============================== Unit functions ===============================}

// Translates a string between the Ansi, Dos, KOI8 and ISO code pages
function EncodingTranslate(const S: string;
  const TranslateDirection: TTranslateDirection): string;

// Computes the translation direction for a pair of encodings
function GetTranslateDirection(const AFrom, ATo: TEncodingType): TTranslateDirection;

// Converts an encoding type to its name
function EncodingToStr(const AEncoding: TEncodingType): string;

// Converts an encoding name to an encoding type
function StrToEncoding(const S: string): TEncodingType;

// Recognizes the encoding of a text
function RecognizeEncoding(ALines: TStrings; const ALinesLimit: Integer;
  var RP: TRecognizeProbabilities): TEncodingType; overload;

// Recognizes the encoding of a file
function RecognizeEncoding(const AFileName: string; const ALinesLimit: Integer;
  var RP: TRecognizeProbabilities; var AError: Boolean): TEncodingType; overload;

// Confirms that an encoding recognition result is reliable
function ConfirmEncoding(const RP: TRecognizeProbabilities; const EdgePercent: Integer): Boolean;

var
  // Global variables related to encodings
  AutoEncodingEnabled: Boolean = True;      // Flag of the "Auto-detect encoding" option
  AutoEncodingDepth: Integer = 128;         // Recognition depth
  AutoEncodingEdge: Integer = 5;            // Failure threshold (manual encoding selection window is shown)
  AutoEncodingFlashStatus: Boolean = False; // Flash the status bar on change

{=========================== Implementation section ===========================}
implementation

type
  PCodePage = ^TCodePage;

{============================== Unit constants ================================}
const
  // Cxx is the character with code $xx. The only exception is C00: it is
  // deliberately a space (#$20, not #$00) and serves as the placeholder for
  // characters that have no counterpart in the Ansi* tables.
  C00 = #$20; C01 = #$01; C02 = #$02; C03 = #$03; C04 = #$04; C05 = #$05; C06 = #$06; C07 = #$07;
  C08 = #$08; C09 = #$09; C0A = #$0A; C0B = #$0B; C0C = #$0C; C0D = #$0D; C0E = #$0E; C0F = #$0F;
  C10 = #$10; C11 = #$11; C12 = #$12; C13 = #$13; C14 = #$14; C15 = #$15; C16 = #$16; C17 = #$17;
  C18 = #$18; C19 = #$19; C1A = #$1A; C1B = #$1B; C1C = #$1C; C1D = #$1D; C1E = #$1E; C1F = #$1F;
  C20 = #$20; C21 = #$21; C22 = #$22; C23 = #$23; C24 = #$24; C25 = #$25; C26 = #$26; C27 = #$27;
  C28 = #$28; C29 = #$29; C2A = #$2A; C2B = #$2B; C2C = #$2C; C2D = #$2D; C2E = #$2E; C2F = #$2F;
  C30 = #$30; C31 = #$31; C32 = #$32; C33 = #$33; C34 = #$34; C35 = #$35; C36 = #$36; C37 = #$37;
  C38 = #$38; C39 = #$39; C3A = #$3A; C3B = #$3B; C3C = #$3C; C3D = #$3D; C3E = #$3E; C3F = #$3F;
  C40 = #$40; C41 = #$41; C42 = #$42; C43 = #$43; C44 = #$44; C45 = #$45; C46 = #$46; C47 = #$47;
  C48 = #$48; C49 = #$49; C4A = #$4A; C4B = #$4B; C4C = #$4C; C4D = #$4D; C4E = #$4E; C4F = #$4F;
  C50 = #$50; C51 = #$51; C52 = #$52; C53 = #$53; C54 = #$54; C55 = #$55; C56 = #$56; C57 = #$57;
  C58 = #$58; C59 = #$59; C5A = #$5A; C5B = #$5B; C5C = #$5C; C5D = #$5D; C5E = #$5E; C5F = #$5F;
  C60 = #$60; C61 = #$61; C62 = #$62; C63 = #$63; C64 = #$64; C65 = #$65; C66 = #$66; C67 = #$67;
  C68 = #$68; C69 = #$69; C6A = #$6A; C6B = #$6B; C6C = #$6C; C6D = #$6D; C6E = #$6E; C6F = #$6F;
  C70 = #$70; C71 = #$71; C72 = #$72; C73 = #$73; C74 = #$74; C75 = #$75; C76 = #$76; C77 = #$77;
  C78 = #$78; C79 = #$79; C7A = #$7A; C7B = #$7B; C7C = #$7C; C7D = #$7D; C7E = #$7E; C7F = #$7F;
  C80 = #$80; C81 = #$81; C82 = #$82; C83 = #$83; C84 = #$84; C85 = #$85; C86 = #$86; C87 = #$87;
  C88 = #$88; C89 = #$89; C8A = #$8A; C8B = #$8B; C8C = #$8C; C8D = #$8D; C8E = #$8E; C8F = #$8F;
  C90 = #$90; C91 = #$91; C92 = #$92; C93 = #$93; C94 = #$94; C95 = #$95; C96 = #$96; C97 = #$97;
  C98 = #$98; C99 = #$99; C9A = #$9A; C9B = #$9B; C9C = #$9C; C9D = #$9D; C9E = #$9E; C9F = #$9F;
  CA0 = #$A0; CA1 = #$A1; CA2 = #$A2; CA3 = #$A3; CA4 = #$A4; CA5 = #$A5; CA6 = #$A6; CA7 = #$A7;
  CA8 = #$A8; CA9 = #$A9; CAA = #$AA; CAB = #$AB; CAC = #$AC; CAD = #$AD; CAE = #$AE; CAF = #$AF;
  CB0 = #$B0; CB1 = #$B1; CB2 = #$B2; CB3 = #$B3; CB4 = #$B4; CB5 = #$B5; CB6 = #$B6; CB7 = #$B7;
  CB8 = #$B8; CB9 = #$B9; CBA = #$BA; CBB = #$BB; CBC = #$BC; CBD = #$BD; CBE = #$BE; CBF = #$BF;
  CC0 = #$C0; CC1 = #$C1; CC2 = #$C2; CC3 = #$C3; CC4 = #$C4; CC5 = #$C5; CC6 = #$C6; CC7 = #$C7;
  CC8 = #$C8; CC9 = #$C9; CCA = #$CA; CCB = #$CB; CCC = #$CC; CCD = #$CD; CCE = #$CE; CCF = #$CF;
  CD0 = #$D0; CD1 = #$D1; CD2 = #$D2; CD3 = #$D3; CD4 = #$D4; CD5 = #$D5; CD6 = #$D6; CD7 = #$D7;
  CD8 = #$D8; CD9 = #$D9; CDA = #$DA; CDB = #$DB; CDC = #$DC; CDD = #$DD; CDE = #$DE; CDF = #$DF;
  CE0 = #$E0; CE1 = #$E1; CE2 = #$E2; CE3 = #$E3; CE4 = #$E4; CE5 = #$E5; CE6 = #$E6; CE7 = #$E7;
  CE8 = #$E8; CE9 = #$E9; CEA = #$EA; CEB = #$EB; CEC = #$EC; CED = #$ED; CEE = #$EE; CEF = #$EF;
  CF0 = #$F0; CF1 = #$F1; CF2 = #$F2; CF3 = #$F3; CF4 = #$F4; CF5 = #$F5; CF6 = #$F6; CF7 = #$F7;
  CF8 = #$F8; CF9 = #$F9; CFA = #$FA; CFB = #$FB; CFC = #$FC; CFD = #$FD; CFE = #$FE; CFF = #$FF;

{******************************************************************************}
{***************************** Code page tables *******************************}
{******************************************************************************}
// Each table is indexed by the source byte ($80..$FF) and yields the target
// byte. "Ansi" is Windows-1251, "Dos" is CP866, "Iso" is ISO-8859-5 and
// "Koi8" is KOI8-R. Entries are written as numeric Cxx constants so that the
// tables do not depend on the encoding the source file is saved in.

{--------------------------------- Windows ------------------------------------}

  //  0    1    2    3    4    5    6    7    8    9    A    B    C    D    E    F
  AnsiToDosCodePage: TCodePage = (
    C00, C00, C00, C00, C00, C00, C00, C00, C00, C00, C00, C00, C00, C00, C00, C00,  // 80-8F
    C00, C00, C00, C00, C00, CF9, C00, C00, C00, C00, C00, C00, C00, C00, C00, C00,  // 90-9F
    C00, CF6, CF7, C00, CFD, C00, C00, C00, CF0, C00, CF2, C00, C00, C00, C00, CF4,  // A0-AF
    CF8, C00, C00, C00, C00, C00, C00, CFA, CF1, CFC, CF3, C00, C00, C00, C00, CF5,  // B0-BF
    C80, C81, C82, C83, C84, C85, C86, C87, C88, C89, C8A, C8B, C8C, C8D, C8E, C8F,  // C0-CF
    C90, C91, C92, C93, C94, C95, C96, C97, C98, C99, C9A, C9B, C9C, C9D, C9E, C9F,  // D0-DF
    CA0, CA1, CA2, CA3, CA4, CA5, CA6, CA7, CA8, CA9, CAA, CAB, CAC, CAD, CAE, CAF,  // E0-EF
    CE0, CE1, CE2, CE3, CE4, CE5, CE6, CE7, CE8, CE9, CEA, CEB, CEC, CED, CEE, CEF); // F0-FF

  //  0    1    2    3    4    5    6    7    8    9    A    B    C    D    E    F
  AnsiToIsoCodePage: TCodePage = (
    C00, C00, C00, C00, C00, C00, C00, C00, C00, C00, C00, C00, C00, C00, C00, C00,  // 80-8F
    C00, C00, C00, C00, C00, CFE, C00, C00, C00, C00, C00, C00, C00, C00, C00, C00,  // 90-9F
    C00, C00, C00, C00, C00, C00, C00, C00, CA1, C00, C00, C00, C00, C00, C00, C00,  // A0-AF ($A8, capital Yo, is $A1 in ISO-8859-5)
    C00, CFB, C00, C00, C00, C00, C00, C00, CF1, C00, C00, C00, C00, C00, C00, C00,  // B0-BF
    CB0, CB1, CB2, CB3, CB4, CB5, CB6, CB7, CB8, CB9, CBA, CBB, CBC, CBD, CBE, CBF,  // C0-CF
    CC0, CC1, CC2, CC3, CC4, CC5, CC6, CC7, CC8, CC9, CCA, CCB, CCC, CCD, CCE, CCF,  // D0-DF
    CD0, CD1, CD2, CD3, CD4, CD5, CD6, CD7, CD8, CD9, CDA, CDB, CDC, CDD, CDE, CDF,  // E0-EF
    CE0, CE1, CE2, CE3, CE4, CE5, CE6, CE7, CE8, CE9, CEA, CEB, CEC, CED, CEE, CEF); // F0-FF

  // NOTE(review): $A8/$B8 (Yo) are mapped to plain Ye ($E5/$C5) and $AB/$BB
  // (guillemets) to $AE/$AF, although KOI8-R has Yo at $B3/$A3. This may be a
  // deliberate lossy fallback, so the entries are kept as they were - confirm.
  //  0    1    2    3    4    5    6    7    8    9    A    B    C    D    E    F
  AnsiToKoi8CodePage: TCodePage = (
    C00, C00, C00, C00, C00, C00, C00, C00, C00, C00, C00, C00, C00, C00, C00, C00,  // 80-8F
    C00, C00, C00, C00, C00, C00, C00, C00, C00, C00, C00, C00, C00, C00, C00, C00,  // 90-9F
    C00, C00, C00, C00, C00, C00, C00, C00, CE5, C00, C00, CAE, C00, C00, C00, C00,  // A0-AF
    C00, C00, C00, C00, C00, C00, C00, C00, CC5, C00, C00, CAF, C00, C00, C00, C00,  // B0-BF
    CE1, CE2, CF7, CE7, CE4, CE5, CF6, CFA, CE9, CEA, CEB, CEC, CED, CEE, CEF, CF0,  // C0-CF
    CF2, CF3, CF4, CF5, CE6, CE8, CE3, CFE, CFB, CFD, CFF, CF9, CF8, CFC, CE0, CF1,  // D0-DF
    CC1, CC2, CD7, CC7, CC4, CC5, CD6, CDA, CC9, CCA, CCB, CCC, CCD, CCE, CCF, CD0,  // E0-EF
    CD2, CD3, CD4, CD5, CC6, CC8, CC3, CDE, CDB, CDD, CDF, CD9, CD8, CDC, CC0, CD1); // F0-FF

{----------------------------------- Dos --------------------------------------}

  //  0    1    2    3    4    5    6    7    8    9    A    B    C    D    E    F
  DosToAnsiCodePage: TCodePage = (
    CC0, CC1, CC2, CC3, CC4, CC5, CC6, CC7, CC8, CC9, CCA, CCB, CCC, CCD, CCE, CCF,  // 80-8F
    CD0, CD1, CD2, CD3, CD4, CD5, CD6, CD7, CD8, CD9, CDA, CDB, CDC, CDD, CDE, CDF,  // 90-9F
    CE0, CE1, CE2, CE3, CE4, CE5, CE6, CE7, CE8, CE9, CEA, CEB, CEC, CED, CEE, CEF,  // A0-AF
    C2D, C2D, C2D, CA6, C2B, CA6, CA6, CAC, CAC, CA6, CA6, CAC, C2D, C2D, C2D, CAC,  // B0-BF
    C4C, C2B, C54, C2B, C2D, C2B, CA6, CA6, C4C, CE3, CA6, C54, CA6, C3D, C2B, CA6,  // C0-CF
    CA6, C54, C54, C4C, C4C, C2D, CE3, C2B, C2B, C2D, C2D, C2D, C2D, CA6, CA6, C2D,  // D0-DF
    CF0, CF1, CF2, CF3, CF4, CF5, CF6, CF7, CF8, CF9, CFA, CFB, CFC, CFD, CFE, CFF,  // E0-EF
    CA8, CB8, CAA, CBA, CAF, CBF, CA1, CA2, CB0, C95, CB7, C76, CB9, CA4, CA6, CA0); // F0-FF

  //  0    1    2    3    4    5    6    7    8    9    A    B    C    D    E    F
  DosToIsoCodePage: TCodePage = (
    CB0, CB1, CB2, CB3, CB4, CB5, CB6, CB7, CB8, CB9, CBA, CBB, CBC, CBD, CBE, CBF,  // 80-8F
    CC0, CC1, CC2, CC3, CC4, CC5, CC6, CC7, CC8, CC9, CCA, CCB, CCC, CCD, CCE, CCF,  // 90-9F
    CD0, CD1, CD2, CD3, CD4, CD5, CD6, CD7, CD8, CD9, CDA, CDB, CDC, CDD, CDE, CDF,  // A0-AF
    C3F, C3F, C3F, C3F, C3F, C3F, C3F, C3F, C3F, C3F, C3F, C3F, C3F, C3F, C3F, C3F,  // B0-BF
    C3F, C3F, C3F, C3F, C3F, C3F, C3F, C3F, C3F, C3F, C3F, C3F, C3F, C3F, C3F, C3F,  // C0-CF
    C3F, C3F, C3F, C3F, C3F, C3F, C3F, C3F, C3F, C3F, C3F, C3F, C3F, C3F, C3F, C3F,  // D0-DF
    CE0, CE1, CE2, CE3, CE4, CE5, CE6, CE7, CE8, CE9, CEA, CEB, CEC, CED, CEE, CEF,  // E0-EF
    CA1, CF1, CA4, CF4, CA7, CF7, CAE, CFE, C3F, C3F, C2E, C3F, CF0, C24, C3F, CA0); // F0-FF ($F7, small short U, is $FE in ISO-8859-5)

  //  0    1    2    3    4    5    6    7    8    9    A    B    C    D    E    F
  DosToKoi8CodePage: TCodePage = (
    CE1, CE2, CF7, CE7, CE4, CE5, CF6, CFA, CE9, CEA, CEB, CEC, CED, CEE, CEF, CF0,  // 80-8F ($8E, capital O, is $EF in KOI8-R)
    CF2, CF3, CF4, CF5, CE6, CE8, CE3, CFE, CFB, CFD, CFF, CF9, CF8, CFC, CE0, CF1,  // 90-9F
    CC1, CC2, CD7, CC7, CC4, CC5, CD6, CDA, CC9, CCA, CCB, CCC, CCD, CCE, CCF, CD0,  // A0-AF
    C90, C91, C92, C81, C87, CB2, CB4, CA7, CA6, CB5, CA1, CA8, CAE, CAD, CAC, C83,  // B0-BF
    C84, C89, C88, C86, C80, C8A, CAF, CB0, CAB, CA5, CBB, CB8, CB1, CA0, CBE, CB9,  // C0-CF
    CBA, CB6, CB7, CAA, CA9, CA2, CA4, CBD, CBC, C85, C82, C8D, C8C, C8E, C8F, C8B,  // D0-DF
    CD2, CD3, CD4, CD5, CC6, CC8, CC3, CDE, CDB, CDD, CDF, CD9, CD8, CDC, CC0, CD1,  // E0-EF
    CB3, CA3, C3F, C3F, C3F, C3F, C3F, C3F, C9C, C95, C9E, C96, C3F, C3F, C94, C9A); // F0-FF

{----------------------------------- Iso --------------------------------------}

  //  0    1    2    3    4    5    6    7    8    9    A    B    C    D    E    F
  IsoToAnsiCodePage: TCodePage = (
    C3F, C3F, C3F, C3F, C3F, C3F, C3F, C3F, C3F, C3F, C3F, C3F, C3F, C3F, C3F, C3F,  // 80-8F
    C3F, C3F, C3F, C3F, C3F, C3F, C3F, C3F, C98, C3F, C3F, C3F, C3F, C3F, C3F, C3F,  // 90-9F
    CA0, CA8, C80, C81, CAA, CBD, CB2, CAF, CA3, C8A, C8C, C8E, C8D, CAD, CA1, C8F,  // A0-AF
    CC0, CC1, CC2, CC3, CC4, CC5, CC6, CC7, CC8, CC9, CCA, CCB, CCC, CCD, CCE, CCF,  // B0-BF
    CD0, CD1, CD2, CD3, CD4, CD5, CD6, CD7, CD8, CD9, CDA, CDB, CDC, CDD, CDE, CDF,  // C0-CF
    CE0, CE1, CE2, CE3, CE4, CE5, CE6, CE7, CE8, CE9, CEA, CEB, CEC, CED, CEE, CEF,  // D0-DF
    CF0, CF1, CF2, CF3, CF4, CF5, CF6, CF7, CF8, CF9, CFA, CFB, CFC, CFD, CFE, CFF,  // E0-EF
    CB9, CB8, C90, C83, CBA, CBE, CB3, CBF, CBC, C9A, C9C, C9E, C9D, CA7, CA2, C9F); // F0-FF

  //  0    1    2    3    4    5    6    7    8    9    A    B    C    D    E    F
  IsoToDosCodePage: TCodePage = (
    C3F, C3F, C3F, C3F, C3F, C3F, C3F, C3F, C3F, C3F, C3F, C3F, C3F, C3F, C3F, C3F,  // 80-8F
    C3F, C3F, C3F, C3F, C3F, C3F, C3F, C3F, C3F, C3F, C3F, C3F, C3F, C3F, C3F, C3F,  // 90-9F
    CFF, CF0, C3F, C3F, CF2, C3F, C3F, CF4, C3F, C3F, C3F, C3F, C3F, C2D, CF6, C3F,  // A0-AF
    C80, C81, C82, C83, C84, C85, C86, C87, C88, C89, C8A, C8B, C8C, C8D, C8E, C8F,  // B0-BF
    C90, C91, C92, C93, C94, C95, C96, C97, C98, C99, C9A, C9B, C9C, C9D, C9E, C9F,  // C0-CF
    CA0, CA1, CA2, CA3, CA4, CA5, CA6, CA7, CA8, CA9, CAA, CAB, CAC, CAD, CAE, CAF,  // D0-DF
    CE0, CE1, CE2, CE3, CE4, CE5, CE6, CE7, CE8, CE9, CEA, CEB, CEC, CED, CEE, CEF,  // E0-EF
    CFC, CF1, C3F, C3F, CF3, C3F, C3F, CF5, C3F, C3F, C3F, C3F, C3F, C15, CF7, C3F); // F0-FF

  //  0    1    2    3    4    5    6    7    8    9    A    B    C    D    E    F
  IsoToKoi8CodePage: TCodePage = (
    C3F, C3F, C3F, C3F, C3F, C3F, C3F, C3F, C3F, C3F, C3F, C3F, C3F, C3F, C3F, C3F,  // 80-8F
    C3F, C3F, C3F, C3F, C3F, C3F, C3F, C3F, C3F, C3F, C3F, C3F, C3F, C3F, C3F, C3F,  // 90-9F
    C9A, CB3, C3F, C3F, C3F, C3F, C3F, C3F, C3F, C3F, C3F, C3F, C3F, C2D, C3F, C3F,  // A0-AF
    CE1, CE2, CF7, CE7, CE4, CE5, CF6, CFA, CE9, CEA, CEB, CEC, CED, CEE, CEF, CF0,  // B0-BF ($BE, capital O, is $EF in KOI8-R)
    CF2, CF3, CF4, CF5, CE6, CE8, CE3, CFE, CFB, CFD, CFF, CF9, CF8, CFC, CE0, CF1,  // C0-CF
    CC1, CC2, CD7, CC7, CC4, CC5, CD6, CDA, CC9, CCA, CCB, CCC, CCD, CCE, CCF, CD0,  // D0-DF
    CD2, CD3, CD4, CD5, CC6, CC8, CC3, CDE, CDB, CDD, CDF, CD9, CD8, CDC, CC0, CD1,  // E0-EF
    C3F, CA3, C3F, C3F, C3F, C3F, C3F, C3F, C3F, C3F, C3F, C3F, C3F, C15, C3F, C3F); // F0-FF

{----------------------------------- Koi8 -------------------------------------}

  //  0    1    2    3    4    5    6    7    8    9    A    B    C    D    E    F
  Koi8ToAnsiCodePage: TCodePage = (
    C2D, CA6, C2D, CAC, C4C, C2D, C2B, C2B, C54, C2B, C2B, C2D, C2D, C2D, CA6, CA6,  // 80-8F
    C2D, C2D, C2D, C3F, CA6, C95, C76, C3F, C3F, C3F, CA0, C3F, CB0, C3F, CB7, C3F,  // 90-9F
    C3D, CA6, C2D, CB8, CE3, CE3, CAC, CAC, CAC, C4C, C4C, C4C, C2D, C2D, C2D, CA6,  // A0-AF
    CA6, CA6, CA6, CA8, CA6, CA6, C54, C54, C54, CA6, CA6, CA6, C2B, C2B, C2B, CA9,  // B0-BF
    CFE, CE0, CE1, CF6, CE4, CE5, CF4, CE3, CF5, CE8, CE9, CEA, CEB, CEC, CED, CEE,  // C0-CF
    CEF, CFF, CF0, CF1, CF2, CF3, CE6, CE2, CFC, CFB, CE7, CF8, CFD, CF9, CF7, CFA,  // D0-DF
    CDE, CC0, CC1, CD6, CC4, CC5, CD4, CC3, CD5, CC8, CC9, CCA, CCB, CCC, CCD, CCE,  // E0-EF
    CCF, CDF, CD0, CD1, CD2, CD3, CC6, CC2, CDC, CDB, CC7, CD8, CDD, CD9, CD7, CDA); // F0-FF

  // Inverse of DosToKoi8CodePage. KOI8-R characters that CP866 does not have
  // are mapped to '?'.
  //  0    1    2    3    4    5    6    7    8    9    A    B    C    D    E    F
  Koi8ToDosCodePage: TCodePage = (
    CC4, CB3, CDA, CBF, CC0, CD9, CC3, CB4, CC2, CC1, CC5, CDF, CDC, CDB, CDD, CDE,  // 80-8F
    CB0, CB1, CB2, C3F, CFE, CF9, CFB, C3F, C3F, C3F, CFF, C3F, CF8, C3F, CFA, C3F,  // 90-9F
    CCD, CBA, CD5, CF1, CD6, CC9, CB8, CB7, CBB, CD4, CD3, CC8, CBE, CBD, CBC, CC6,  // A0-AF
    CC7, CCC, CB5, CF0, CB6, CB9, CD1, CD2, CCB, CCF, CD0, CCA, CD8, CD7, CCE, C3F,  // B0-BF
    CEE, CA0, CA1, CE6, CA4, CA5, CE4, CA3, CE5, CA8, CA9, CAA, CAB, CAC, CAD, CAE,  // C0-CF
    CAF, CEF, CE0, CE1, CE2, CE3, CA6, CA2, CEC, CEB, CA7, CE8, CED, CE9, CE7, CEA,  // D0-DF
    C9E, C80, C81, C96, C84, C85, C94, C83, C95, C88, C89, C8A, C8B, C8C, C8D, C8E,  // E0-EF
    C8F, C9F, C90, C91, C92, C93, C86, C82, C9C, C9B, C87, C98, C9D, C99, C97, C9A); // F0-FF

  //  0    1    2    3    4    5    6    7    8    9    A    B    C    D    E    F
  Koi8ToIsoCodePage: TCodePage = (
    C3F, C3F, C3F, C3F, C3F, C3F, C3F, C3F, C3F, C3F, C3F, C3F, C3F, C3F, C3F, C3F,  // 80-8F
    C3F, C3F, C3F, C3F, C3F, C3F, C3F, C3F, C3F, C3F, CA0, C3F, C3F, C32, C2E, C3F,  // 90-9F
    C3F, C3F, C3F, CF1, C3F, C3F, C3F, C3F, C3F, C3F, C3F, C3F, C3F, C3F, C3F, C3F,  // A0-AF
    C3F, C3F, C3F, CA1, C3F, C3F, C3F, C3F, C3F, C3F, C3F, C3F, C3F, C3F, C3F, C43,  // B0-BF
    CEE, CD0, CD1, CE6, CD4, CD5, CE4, CD3, CE5, CD8, CD9, CDA, CDB, CDC, CDD, CDE,  // C0-CF
    CDF, CEF, CE0, CE1, CE2, CE3, CD6, CD2, CEC, CEB, CD7, CE8, CED, CE9, CE7, CEA,  // D0-DF
    CCE, CB0, CB1, CC6, CB4, CB5, CC4, CB3, CC5, CB8, CB9, CBA, CBB, CBC, CBD, CBE,  // E0-EF
    CBF, CCF, CC0, CC1, CC2, CC3, CB6, CB2, CCC, CCB, CB7, CC8, CCD, CC9, CC7, CCA); // F0-FF

// Translates a string between the Ansi, Dos, KOI8 and ISO code pages.
//   S                  - source string; it is not modified.
//   TranslateDirection - which table to apply; tdNone returns S unchanged.
// Returns a copy of S in which every character with a code in $80..$FF is
// replaced through the selected table. All other characters are kept as is.
function EncodingTranslate(const S: string;
  const TranslateDirection: TTranslateDirection): string;
var
  i, Code: Integer;
  Table: PCodePage;
begin
  Result := S;

  // A pointer is used so that the 128-entry table is not copied on every call.
  case TranslateDirection of
    tdAnsiToDos  : Table := @AnsiToDosCodePage;
    tdAnsiToKOI8 : Table := @AnsiToKoi8CodePage;
    tdAnsiToISO  : Table := @AnsiToIsoCodePage;
    tdDosToAnsi  : Table := @DosToAnsiCodePage;
    tdDosToKOI8  : Table := @DosToKoi8CodePage;
    tdDosToISO   : Table := @DosToIsoCodePage;
    tdKOI8ToAnsi : Table := @Koi8ToAnsiCodePage;
    tdKOI8ToDos  : Table := @Koi8ToDosCodePage;
    tdKOI8ToISO  : Table := @Koi8ToIsoCodePage;
    tdISOToAnsi  : Table := @IsoToAnsiCodePage;
    tdISOToDos   : Table := @IsoToDosCodePage;
    tdISOToKOI8  : Table := @IsoToKoi8CodePage;
  else
    Exit; // tdNone, or a value outside the enumeration: nothing to translate
  end; //case

  for i := 1 to Length(Result) do
  begin
    Code := Ord(Result[i]);
    // Only codes covered by the table are translated; this also keeps the
    // index in range if Char happens to be wider than one byte.
    if (Code >= Low(TCodePage)) and (Code <= High(TCodePage)) then
      Result[i] := Table^[Code];
  end;
end;

// Computes the translation direction for a pair of encodings.
// Returns tdNone when AFrom = ATo. Otherwise the result is td<ATo>To<AFrom>,
// i.e. the table that converts bytes of ATo's code page into AFrom's code
// page (for example AFrom = etWIN, ATo = etDOS gives tdDosToAnsi).
// NOTE(review): this looks reversed with respect to the parameter names, but
// it is consistent for all twelve pairs and callers outside this unit may
// rely on it, so the mapping is kept as is - confirm the intended semantics.
function GetTranslateDirection(const AFrom, ATo: TEncodingType): TTranslateDirection;
begin
  Result := tdNone;
  if AFrom = ATo then
    Exit;

  case AFrom of
    etWIN:
      case ATo of
        etDOS: Result := tdDosToAnsi;
        etISO: Result := tdISOToAnsi;
        etKOI: Result := tdKOI8ToAnsi;
      end; //case
    etDOS:
      case ATo of
        etWIN: Result := tdAnsiToDos;
        etISO: Result := tdISOToDos;
        etKOI: Result := tdKOI8ToDos;
      end; //case
    etISO:
      case ATo of
        etWIN: Result := tdAnsiToISO;
        etDOS: Result := tdDosToISO;
        etKOI: Result := tdKOI8ToISO;
      end; //case
    etKOI:
      case ATo of
        etWIN: Result := tdAnsiToKOI8;
        etDOS: Result := tdDosToKOI8;
        etISO: Result := tdISOToKOI8;
      end; //case
  end; //case
end;

// Converts an encoding type to its name from EncodingNames.
function EncodingToStr(const AEncoding: TEncodingType): string;
begin
  Result := EncodingNames[AEncoding];
end;

// Converts an encoding name to an encoding type.
// The comparison ignores letter case. Unknown names yield etWIN.
function StrToEncoding(const S: string): TEncodingType;
var
  eIndex: TEncodingType;
begin
  for eIndex := Low(TEncodingType) to High(TEncodingType) do
    if SameText(S, EncodingNames[eIndex]) then
    begin
      Result := eIndex;
      Exit;
    end;
  Result := Low(TEncodingType); // Default is Windows.
end;

const
  // Range of the Russian letters in Windows-1251: capital A ($C0) .. small ya ($FF)
  FirstCyrillic = #$C0;
  LastCyrillic  = #$FF;

  // Occurrence probabilities of Russian letters in text.
  // The original author obtained them empirically: a dedicated console
  // counting program was fed a couple of dozen megabytes of text files with
  // stories, jokes and the like.
  Probabilities: array[FirstCyrillic..LastCyrillic] of Single = (
    0.057, 0.010, 0.031, 0.011, 0.021, 0.067, 0.007, 0.013,  // АБВГДЕЖЗ
    0.052, 0.011, 0.023, 0.030, 0.024, 0.043, 0.075, 0.026,  // ИЙКЛМНОП
    0.038, 0.034, 0.046, 0.016, 0.001, 0.006, 0.002, 0.011,  // РСТУФХЦЧ
    0.004, 0.004, 0.000, 0.012, 0.012, 0.003, 0.005, 0.015,  // ШЩЪЫЬЭЮЯ
    0.057, 0.010, 0.031, 0.011, 0.021, 0.067, 0.007, 0.013,  // абвгдежз
    0.052, 0.011, 0.023, 0.030, 0.024, 0.043, 0.075, 0.026,  // ийклмноп
    0.038, 0.034, 0.046, 0.016, 0.001, 0.006, 0.002, 0.011,  // рстуфхцч
    0.004, 0.004, 0.000, 0.012, 0.012, 0.003, 0.005, 0.015); // шщъыьэюя

  // Table that brings text of each candidate encoding to Windows-1251
  ToAnsiDirection: array[TEncodingType] of TTranslateDirection =
    (tdNone, tdDosToAnsi, tdISOToAnsi, tdKOI8ToAnsi);

// Recognizes the encoding of a text.
//   ALines      - text to analyse; nil is treated as an empty text.
//   ALinesLimit - maximum number of leading lines to analyse; a value <= 0
//                 means all lines.
//   RP          - receives, per encoding, its share of the total score
//                 (all zeros when no Russian letters were found).
// Every line is interpreted in each candidate encoding, converted to
// Windows-1251 and scored by summing the letter probabilities. The encoding
// with the highest score wins; etWIN is returned when all scores are zero.
function RecognizeEncoding(ALines: TStrings; const ALinesLimit: Integer;
  var RP: TRecognizeProbabilities): TEncodingType;
var
  LineIdx, CharIdx, LastLine: Integer;
  eIndex: TEncodingType;                  // Index over the encodings
  Counts: array[TEncodingType] of Single; // Probability accumulators
  Line, Temp: string;
  Best: Single;                           // Highest probability sum
  Total: Single;                          // Overall sum, used to compute the shares
begin
  Total := 0.0;
  FillChar(Counts, SizeOf(Counts), 0);
  FillChar(RP, SizeOf(RP), 0);
  Result := Low(TEncodingType); // Default
  if ALines = nil then
    Exit;

  // Index of the last line to analyse
  LastLine := ALines.Count - 1;
  if (ALinesLimit > 0) and (ALinesLimit - 1 < LastLine) then
    LastLine := ALinesLimit - 1;

  // Scoring. Each line is fetched once and tried in every encoding; every
  // accumulator still receives its addends in line order.
  for LineIdx := 0 to LastLine do
  begin
    Line := ALines[LineIdx];
    for eIndex := Low(TEncodingType) to High(TEncodingType) do
    begin
      // Convert the test line from the candidate encoding
      Temp := EncodingTranslate(Line, ToAnsiDirection[eIndex]);
      // Sum the probabilities of the letters in the line
      for CharIdx := 1 to Length(Temp) do
        if (Temp[CharIdx] >= Low(Probabilities)) and (Temp[CharIdx] <= High(Probabilities)) then
          Counts[eIndex] := Counts[eIndex] + Probabilities[Temp[CharIdx]];
    end;
  end;

  // Overall sum of the scores
  for eIndex := Low(TEncodingType) to High(TEncodingType) do
    Total := Total + Counts[eIndex];

  // Find the maximum and, from it, the result
  Best := 0.0;
  for eIndex := Low(TEncodingType) to High(TEncodingType) do
    if Counts[eIndex] > Best then
    begin
      Best := Counts[eIndex];
      Result := eIndex;
    end;

  // Convert the scores to shares of the total
  for eIndex := Low(TEncodingType) to High(TEncodingType) do
  begin
    if Total > 0.0 then
      RP[eIndex] := Counts[eIndex] / Total
    else
      RP[eIndex] := 0.0;
    // Debug logging
    //AddToLog(EncodingNames[eIndex] + ': ' + FormatFloat('0.00', 100 * RP[eIndex]) + '%');
  end;
end;

// Recognizes the encoding of a file.
//   AFileName   - file to load and analyse.
//   ALinesLimit - passed on to the TStrings overload.
//   RP          - receives the per-encoding shares.
//   AError      - set to True when loading or analysing raised an exception;
//                 in that case a message box is shown and etWIN is returned.
function RecognizeEncoding(const AFileName: string; const ALinesLimit: Integer;
  var RP: TRecognizeProbabilities; var AError: Boolean): TEncodingType;
var
  List: TStringList;
begin
  Result := etWIN;
  AError := False;
  List := TStringList.Create();
  try
    try
      List.LoadFromFile(AFileName);
      Result := RecognizeEncoding(List, ALinesLimit, RP);
    except
      on E: Exception do
      begin
        AError := True;
        Application.MessageBox(PChar('Не могу открыть файл "' + AFileName + '".'#13#10 + E.Message),
          PChar('Ошибка'), MB_OK + MB_ICONERROR);
      end;
    end;
  finally
    List.Free();
  end;
end;

// Confirms that an encoding recognition result is reliable.
// The encoding is considered recognized reliably enough when the highest
// share exceeds the second highest one by at least EdgePercent / 100, or
// when the highest share is below 0.01 (no Russian letters at all, so the
// text is treated as Windows).
function ConfirmEncoding(const RP: TRecognizeProbabilities; const EdgePercent: Integer): Boolean;
var
  First, Second: Single;
  eIndex: TEncodingType;
begin
  First := 0.0;
  Second := 0.0;
  for eIndex := Low(TEncodingType) to High(TEncodingType) do
  begin
    if RP[eIndex] > First then
    begin
      Second := First;
      First := RP[eIndex];
    end
    else if RP[eIndex] > Second then
      Second := RP[eIndex];
  end;

  Result := (First - Second >= EdgePercent / 100) or // The first share clearly exceeds the second
            (First < 0.01);                          // All shares are zero: Win encoding (no Russian letters)
end;

end.