fineweb-2-japanese-text-cleaner / scripts /noise_detecter.py

Upload 2 files

92c0372 verified 10 months ago

7.03 kB

	import unicodedata
	from typing import List, Tuple

	import torch
	from transformers import AutoModelForTokenClassification, AutoTokenizer


	class NoiseDetector:
	    """Detect noise spans in text with a token-classification model.

	    Texts are NFKC-normalized before tokenization, and a token counts as
	    noise when the softmax probability of class 1 exceeds a threshold.
	    """

	    def __init__(self, model_path: str):
	        """Load the model and tokenizer and switch the model to eval mode.

	        Args:
	            model_path: Value forwarded to ``from_pretrained`` for both the
	                model and the tokenizer.
	        """
	        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
	        self.model = AutoModelForTokenClassification.from_pretrained(model_path).to(
	            self.device
	        )
	        self.tokenizer = AutoTokenizer.from_pretrained(model_path)
	        self.model.eval()

	    def _normalize_text(self, text: str) -> str:
	        """Return the NFKC-normalized form of ``text``."""
	        return unicodedata.normalize("NFKC", text)

	    def _convert_token_spans_to_char_spans(
	        self,
	        text: str,
	        noise_token_indices: List[int],
	        offset_mapping: List[Tuple[int, int]],
	    ) -> List[Tuple[int, int]]:
	        """Merge runs of consecutive noise tokens into character spans.

	        Args:
	            text: The tokenized text (unused; kept for interface stability).
	            noise_token_indices: One truthy/falsy flag per token.
	            offset_mapping: One ``(start, end)`` character offset per token.

	        Returns:
	            ``(start, end)`` tuples, one per run of adjacent noise tokens.
	        """
	        char_spans = []
	        current_span = None

	        for is_noise, (start, end) in zip(noise_token_indices, offset_mapping):
	            # Special tokens (CLS, SEP, etc.) carry the offset (0, 0); they
	            # neither extend nor close the span being built.
	            if start == end == 0:
	                continue

	            if is_noise:
	                if current_span is None:
	                    current_span = [start, end]
	                else:
	                    current_span[1] = end
	            elif current_span is not None:
	                char_spans.append(tuple(current_span))
	                current_span = None

	        # Flush a span that runs up to the last token.
	        if current_span is not None:
	            char_spans.append(tuple(current_span))

	        return char_spans

	    def _align_normalized_to_original(
	        self, text: str, normalized: str
	    ) -> List[Tuple[int, int]]:
	        """Map every character of ``normalized`` back to a range of ``text``.

	        ``text`` is cut into chunks: a base character plus any following
	        characters whose own NFKC form starts with a combining mark (for
	        example a half-width kana followed by its voiced sound mark). Each
	        chunk is normalized on its own, and the result is accepted only when
	        the concatenated pieces reproduce ``normalized`` exactly.

	        Returns:
	            A list with one ``(orig_start, orig_end)`` entry per character of
	            ``normalized``, or an empty list when the chunk-wise
	            normalization does not reproduce ``normalized``.
	        """
	        alignment = []
	        pieces = []
	        total = len(text)
	        chunk_start = 0

	        while chunk_start < total:
	            chunk_end = chunk_start + 1
	            while chunk_end < total:
	                following = self._normalize_text(text[chunk_end])
	                if following and unicodedata.combining(following[0]) != 0:
	                    chunk_end += 1
	                else:
	                    break

	            piece = self._normalize_text(text[chunk_start:chunk_end])
	            pieces.append(piece)
	            alignment.extend([(chunk_start, chunk_end)] * len(piece))
	            chunk_start = chunk_end

	        if "".join(pieces) != normalized:
	            return []
	        return alignment

	    def _highlight_text(
	        self,
	        text: str,
	        spans: List[Tuple[int, int]],
	        max_ignored_length: int = 3,
	    ) -> str:
	        """Wrap noise spans of ``text`` in ``[NOISE]...[/NOISE]`` tags.

	        Args:
	            text: The original (un-normalized) text.
	            spans: ``(start, end)`` offsets into the NFKC-normalized text, as
	                returned by :meth:`detect`.
	            max_ignored_length: Spans of this length or shorter are left
	                untagged.

	        Returns:
	            ``text`` with the qualifying spans tagged. When normalization
	            changed the text, offsets are first mapped back to the original
	            characters; if that mapping cannot be verified, the normalized
	            text is tagged and returned instead so tags never land on the
	            wrong characters.
	        """
	        tagged = sorted(
	            (start, end) for start, end in spans if end - start > max_ignored_length
	        )
	        if not tagged:
	            return text

	        target = text
	        normalized = self._normalize_text(text)
	        if normalized != text:
	            alignment = self._align_normalized_to_original(text, normalized)
	            if alignment:
	                limit = len(alignment)
	                remapped = []
	                for start, end in tagged:
	                    start = max(0, min(start, limit))
	                    end = max(0, min(end, limit))
	                    if start < end:
	                        # Widen to whole original chunks so a character
	                        # that expands under NFKC is never split.
	                        remapped.append((alignment[start][0], alignment[end - 1][1]))
	                tagged = remapped
	            else:
	                target = normalized

	        result = []
	        last_end = 0
	        for start, end in tagged:
	            # Remapped spans may touch or overlap; never emit text twice.
	            start = max(start, last_end)
	            if end <= start:
	                continue
	            result.append(target[last_end:start])
	            result.append(f"[NOISE]{target[start:end]}[/NOISE]")
	            last_end = end
	        result.append(target[last_end:])

	        return "".join(result)

	    def detect(
	        self, texts: List[str], threshold: float = 0.5
	    ) -> List[List[Tuple[int, int]]]:
	        """
	        Detect noise spans in the given texts.

	        Args:
	            texts: List of input texts
	            threshold: Confidence threshold for noise detection (default: 0.5)

	        Returns:
	            List of lists containing (start, end) character positions of
	            detected noise spans for each text. Positions index into the
	            NFKC-normalized form of each text, which is what gets tokenized.
	        """
	        results = []

	        with torch.no_grad():
	            for text in texts:
	                normalized_text = self._normalize_text(text)

	                # Input beyond the tokenizer's maximum length is truncated
	                # and therefore not analysed.
	                tokens = self.tokenizer(
	                    normalized_text,
	                    truncation=True,
	                    return_offsets_mapping=True,
	                    return_tensors="pt",
	                )

	                input_ids = tokens["input_ids"].to(self.device)
	                attention_mask = tokens["attention_mask"].to(self.device)

	                outputs = self.model(input_ids=input_ids, attention_mask=attention_mask)
	                probs = torch.softmax(outputs.logits, dim=-1)

	                # Class 1 is treated as the noise class.
	                noise_probs = probs[0, :, 1].cpu().numpy()
	                noise_predictions = (noise_probs > threshold).astype(int)

	                char_spans = self._convert_token_spans_to_char_spans(
	                    normalized_text,
	                    noise_predictions,
	                    tokens["offset_mapping"][0].tolist(),
	                )

	                results.append(char_spans)

	        return results

	    def detect_and_highlight(
	        self,
	        texts: List[str],
	        threshold: float = 0.5,
	        max_ignored_length: int = 3,
	    ) -> List[str]:
	        """
	        Detect noise spans and return texts with noise sections highlighted.

	        Args:
	            texts: List of input texts
	            threshold: Confidence threshold for noise detection (default: 0.5)
	            max_ignored_length: Spans of this many characters or fewer are
	                not highlighted (default: 3)

	        Returns:
	            List of texts with noise sections wrapped in [NOISE]...[/NOISE]
	            tags. Offsets are mapped from the normalized text back onto each
	            original text; when that is not possible the normalized text is
	            returned with tags.
	        """
	        noise_spans = self.detect(texts, threshold)
	        return [
	            self._highlight_text(text, spans, max_ignored_length)
	            for text, spans in zip(texts, noise_spans)
	        ]


	def main():
	    """Run the detector over a few sample texts and print the tagged output."""
	    # A scraped-page style sample: real sentences mixed with navigation
	    # and advertisement fragments.
	    sample_page = """
	この文章は90日以上更新の無いサイトに表示されています。
	ログインログアウト

	本当に必要な文章以外にも、さまざまなノイズが含まれていることがあります。例えば、この文章もその一例です。本来不要なテキストが入ってしまうことがこのようにあるでしょう。

	今なら50%オフ！クリックしてリンク先の商品を表示

	とりわけ文章長が短い場合、文章のほとんどがノイズを含む可能性があります。それらを取り除くことで、より高品質の文章を抽出できないかと考えています。

	前のページ次のページ
	""".strip()

	    samples = [
	        sample_page,
	        "これは正常なテキストです。しかし、ここに🤣絵文字があります。そして普通の文章が続きます。",
	        "普通の文章です。ASCII ART(^_^)があります。最後も普通です。",
	        "ログイン文章の密ベクトルは、情報検索・文章判別・類似文章抽出など、さまざまな用途に使うことができます。しかしながら最先端のTransformerモデルは小さいモデルでも、とりわけCPU環境では処理速度が遅いため実用でないこともしばしばあります。この課題を解決する新しいアプローチとして、先日公開されたTransformerモデル「ではない」 StaticEmbeddingモデルは、例えば intfloat/multilingual-e5-small (以下mE5-small)とのベンチマーク比較では85%のスコアという最低十分な性能で、何よりCPUで動作時に126倍高速に文ベクトルを作成することができる、という驚きの速度です。記事の一覧＞",
	    ]

	    detector = NoiseDetector("hotchpotch/fineweb-2-japanese-text-cleaner")
	    for highlighted in detector.detect_and_highlight(samples, threshold=0.7):
	        print(f"\n{highlighted}")


	# Run the demo only when this file is executed as a script, not on import.
	if __name__ == "__main__":
	main()