Rust 判断 TXT 文本编码:ANSI、UTF-8、UTF-8 BOM、Unicode

180it 2024-10-20 AM 26℃ 0条
use std::env;
use std::fs::File;
use std::io::{self, Read};
use std::path::Path;

/// Encoding verdicts that the detector in this file can report for a text file.
///
/// The enum is a plain tag with no payload, so it is cheap to copy and can be
/// compared directly with `==`.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
enum TextType {
    /// No known BOM was found and the content was not recognised as UTF-8.
    TextAnsi,
    /// UTF-8 content without a byte-order mark.
    TextUtf8,
    /// UTF-8 content that starts with a byte-order mark.
    TextUtf8Bom,
    /// UTF-16 little-endian, identified by its byte-order mark.
    TextUtf16Le,
    /// UTF-16 big-endian, identified by its byte-order mark.
    TextUtf16Be,
    /// The encoding could not be determined (for example, the file could not be opened).
    TextUnknown,
}

/// Reports whether the file at `file_name` looks like UTF-8 text without a BOM.
///
/// Returns `true` only when the whole file is well-formed UTF-8 **and** holds
/// at least one non-ASCII byte. A pure-ASCII or empty file yields `false`,
/// because such content gives no evidence that distinguishes UTF-8 from a
/// legacy code page.
///
/// Returns `false` (after printing a message to stdout) when the file cannot
/// be opened or read; this function never panics on I/O errors.
///
/// This function does not look for a BOM itself: a leading `EF BB BF` is
/// well-formed UTF-8 and therefore passes validation.
fn check_utf8_without_bom(file_name: &str) -> bool {
    let mut file_in = match File::open(file_name) {
        Ok(file) => file,
        Err(_) => {
            println!("打开文件失败");
            return false;
        }
    };

    let mut buffer = Vec::new();
    // A failed read is reported like a failed open instead of aborting the
    // whole program.
    if file_in.read_to_end(&mut buffer).is_err() {
        println!("无法读取文件内容");
        return false;
    }

    is_non_ascii_utf8(&buffer)
}

/// Returns `true` when `bytes` is well-formed UTF-8 containing at least one non-ASCII byte.
fn is_non_ascii_utf8(bytes: &[u8]) -> bool {
    // `std::str::from_utf8` performs strict validation: it rejects truncated
    // sequences, stray continuation bytes, overlong encodings, surrogate code
    // points, the obsolete 5/6-byte forms and the bytes 0xFE/0xFF.
    // The cheap ASCII test runs first so plain ASCII input short-circuits.
    !bytes.is_ascii() && std::str::from_utf8(bytes).is_ok()
}

/// Detects the text encoding of the file at `file_name`.
///
/// The first bytes of the file are compared against known byte-order marks:
///
/// * `FF FE`    -> [`TextType::TextUtf16Le`]
/// * `FE FF`    -> [`TextType::TextUtf16Be`]
/// * `EF BB BF` -> [`TextType::TextUtf8Bom`]
///
/// When no BOM matches, the decision is delegated to
/// `check_utf8_without_bom`: [`TextType::TextUtf8`] if it returns `true`,
/// otherwise [`TextType::TextAnsi`].
///
/// Files shorter than a BOM (including empty files) are handled without
/// panicking; they simply fail the BOM comparison and take the fallback path.
///
/// Returns [`TextType::TextUnknown`] (after printing a message to stdout)
/// when the file cannot be opened or its header cannot be read.
fn check_text_encode(file_name: &str) -> TextType {
    const UTF16_LE_BOM: [u8; 2] = [0xFF, 0xFE];
    const UTF16_BE_BOM: [u8; 2] = [0xFE, 0xFF];
    const UTF8_BOM: [u8; 3] = [0xEF, 0xBB, 0xBF];

    let file_in = match File::open(file_name) {
        Ok(file) => file,
        Err(_) => {
            println!("打开文件失败");
            return TextType::TextUnknown;
        }
    };

    // Read at most as many bytes as the longest BOM. Unlike `read_exact`,
    // `take` + `read_to_end` succeeds on files shorter than that and just
    // yields fewer bytes.
    let mut head = Vec::with_capacity(UTF8_BOM.len());
    if file_in
        .take(UTF8_BOM.len() as u64)
        .read_to_end(&mut head)
        .is_err()
    {
        println!("无法读取文件头");
        return TextType::TextUnknown;
    }

    if head.starts_with(&UTF16_LE_BOM) {
        TextType::TextUtf16Le
    } else if head.starts_with(&UTF16_BE_BOM) {
        TextType::TextUtf16Be
    } else if head.starts_with(&UTF8_BOM) {
        // All three BOM bytes are required; `EF BB` followed by anything
        // else is ordinary content, not a BOM.
        TextType::TextUtf8Bom
    } else if check_utf8_without_bom(file_name) {
        TextType::TextUtf8
    } else {
        TextType::TextAnsi
    }
}

/// Program entry point.
///
/// Takes the first command-line argument as a file path, prints it, runs
/// `check_text_encode` on it and prints a human-readable label for the result.
/// When no argument is supplied, a usage hint is printed instead.
fn main() {
    let argv: Vec<String> = env::args().collect();

    match argv.get(1) {
        // Started without a file argument: show the usage hint.
        None => println!("请将TXT文件拖放到exe程序上."),
        Some(path) => {
            println!("file: {}", path);

            // Map the detection result to the label shown to the user.
            let label = match check_text_encode(path) {
                TextType::TextUnknown => "Unknown",
                TextType::TextUtf16Be => "Unicode big endian",
                TextType::TextUtf16Le => "Unicode",
                TextType::TextUtf8Bom => "UTF-8 BOM",
                TextType::TextUtf8 => "UTF-8",
                TextType::TextAnsi => "ANSI",
            };

            println!("file code: {}", label);
        }
    }
}
支付宝打赏 微信打赏

如果文章或资源对您有帮助,欢迎打赏作者。一路走来,感谢有您!

标签: none

Rust 判断 TXT 文本编码:ANSI、UTF-8、UTF-8 BOM、Unicode