package org.springblade.mdm.utils;

import org.apache.tika.detect.Detector;
import org.apache.tika.io.TikaInputStream;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.mime.MediaType;

import java.io.IOException;
import java.io.InputStream;

/**
 * Heuristic {@link Detector} that classifies a stream as plain text when at least
 * {@link #TEXT_THRESHOLD} of the first {@link #SAMPLE_SIZE} bytes look like
 * single-byte ("ANSI") text: printable ASCII, common whitespace control
 * characters (tab, LF, CR) or bytes in the extended 128-255 range.
 *
 * <p>Anything else, including an empty or {@code null} stream, is reported as
 * {@link MediaType#OCTET_STREAM}.
 */
public class AnsiTextDetector implements Detector {

    /** Maximum number of leading bytes inspected. */
    private static final int SAMPLE_SIZE = 1024;

    // Printable character ranges (ASCII + extended ANSI characters).
    private static final int PRINTABLE_START = 32;
    private static final int PRINTABLE_END = 126;
    private static final int EXTENDED_ANSI_START = 128;
    private static final int EXTENDED_ANSI_END = 255;

    /** Text threshold: share of text-like bytes must be >= 80%. */
    private static final double TEXT_THRESHOLD = 0.8;

    /**
     * Inspects the leading bytes of {@code input} and guesses whether it is plain text.
     *
     * <p>The stream is marked before sampling and reset afterwards, so that the
     * bytes read here remain available to whatever consumes the stream next.
     *
     * @param input    document stream, may be {@code null}
     * @param metadata document metadata; not consulted by this detector
     * @return {@link MediaType#TEXT_PLAIN} when the sample looks like text,
     *         otherwise {@link MediaType#OCTET_STREAM}
     * @throws IOException if reading from or resetting the stream fails
     */
    @Override
    public MediaType detect(InputStream input, Metadata metadata) throws IOException {
        if (input == null) {
            // No content to look at: fall back to the generic type instead of failing.
            return MediaType.OCTET_STREAM;
        }

        // NOTE(review): TikaInputStream.get is expected to return the same instance when
        // the caller already passed a TikaInputStream, and otherwise a wrapper that
        // supports mark/reset - confirm against the Tika version in use.
        TikaInputStream tis = TikaInputStream.get(input);
        byte[] buffer = new byte[SAMPLE_SIZE];
        int bytesRead;

        tis.mark(SAMPLE_SIZE);
        try {
            bytesRead = readSample(tis, buffer);
        } finally {
            // Detection must not consume the document.
            tis.reset();
        }

        if (bytesRead <= 0) {
            return MediaType.OCTET_STREAM;
        }

        int textLike = 0;
        for (int i = 0; i < bytesRead; i++) {
            if (isTextByte(buffer[i] & 0xFF)) { // mask to get the unsigned byte value
                textLike++;
            }
        }

        // Classify as text when the share of text-like bytes reaches the threshold.
        if ((double) textLike / bytesRead >= TEXT_THRESHOLD) {
            return MediaType.TEXT_PLAIN;
        }
        return MediaType.OCTET_STREAM;
    }

    /**
     * Fills {@code buffer} from {@code in} until it is full or the stream ends.
     * A single {@code read} call may return fewer bytes than are available, so
     * the read is repeated to obtain a stable sample.
     *
     * @return number of bytes actually stored in {@code buffer}
     */
    private static int readSample(InputStream in, byte[] buffer) throws IOException {
        int filled = 0;
        while (filled < buffer.length) {
            int n = in.read(buffer, filled, buffer.length - filled);
            if (n <= 0) {
                break;
            }
            filled += n;
        }
        return filled;
    }

    /**
     * Tells whether an unsigned byte value counts as text: printable ASCII,
     * extended ANSI, or one of the whitespace controls tab / LF / CR.
     */
    private static boolean isTextByte(int unsigned) {
        if (unsigned == '\t' || unsigned == '\n' || unsigned == '\r') {
            return true;
        }
        return (unsigned >= PRINTABLE_START && unsigned <= PRINTABLE_END)
                || (unsigned >= EXTENDED_ANSI_START && unsigned <= EXTENDED_ANSI_END);
    }
}