package org.springblade.mdm.utils;
|
import org.apache.tika.detect.Detector;
|
import org.apache.tika.io.TikaInputStream;
|
import org.apache.tika.metadata.Metadata;
|
import org.apache.tika.mime.MediaType;
|
import java.io.IOException;
|
import java.io.InputStream;
|
|
public class AnsiTextDetector implements Detector {
|
// 可打印字符范围(ASCII + 扩展ANSI字符)
|
private static final int PRINTABLE_START = 32;
|
private static final int PRINTABLE_END = 126;
|
private static final int EXTENDED_ANSI_START = 128;
|
private static final int EXTENDED_ANSI_END = 255;
|
|
// 文本判定阈值:可打印字符占比 >= 80%
|
private static final double TEXT_THRESHOLD = 0.8;
|
|
@Override
|
public MediaType detect(InputStream input, Metadata metadata) throws IOException {
|
TikaInputStream tis = TikaInputStream.get(input);
|
byte[] buffer = new byte[1024];
|
int bytesRead = tis.read(buffer);
|
if (bytesRead <= 0) {
|
return MediaType.OCTET_STREAM;
|
}
|
|
int total = 0;
|
int printable = 0;
|
for (int i = 0; i < bytesRead; i++) {
|
byte b = buffer[i];
|
int unsigned = b & 0xFF; // 转为无符号字节
|
total++;
|
// 判断是否为可打印字符(含扩展ANSI)
|
if ((unsigned >= PRINTABLE_START && unsigned <= PRINTABLE_END) ||
|
(unsigned >= EXTENDED_ANSI_START && unsigned <= EXTENDED_ANSI_END)) {
|
printable++;
|
}
|
}
|
|
// 若可打印字符占比达标,判定为文本
|
if ((double) printable / total >= TEXT_THRESHOLD) {
|
return MediaType.TEXT_PLAIN;
|
}
|
return MediaType.OCTET_STREAM;
|
}
|
}
|