Coverage for src/qdrant_loader/connectors/localfile/metadata_extractor.py: 91%
44 statements
« prev ^ index » next coverage.py v7.8.2, created at 2025-06-04 05:50 +0000
« prev ^ index » next coverage.py v7.8.2, created at 2025-06-04 05:50 +0000
1import os
2import re
3from typing import Any
5import chardet
7from qdrant_loader.utils.logging import LoggingConfig
class LocalFileMetadataExtractor:
    """Extract metadata from local files."""

    # Compiled once; matches a run of 1-6 '#' characters followed by
    # whitespace and some text, at the start of the content or after a newline.
    _HEADING_RE = re.compile(
        r"(?:^|\n)\s*(#{1,6})\s+(.+?)(?:\n|$)", re.MULTILINE
    )

    def __init__(self, base_path: str):
        """Store the base path and obtain a logger.

        Args:
            base_path: Directory that file paths are made relative to.
        """
        self.base_path = base_path
        self.logger = LoggingConfig.get_logger(__name__)

    def extract_all_metadata(self, file_path: str, content: str) -> dict[str, Any]:
        """Return file metadata merged with structure metadata.

        Structure metadata is only computed when ``file_path`` ends with
        ``.md`` (case-insensitive); its keys override file-metadata keys of
        the same name.

        Args:
            file_path: Path of the file the content was read from.
            content: Decoded text content of the file.

        Returns:
            A single dictionary combining both metadata groups.
        """
        self.logger.debug(f"Starting metadata extraction for file: {file_path!s}")
        file_metadata = self._extract_file_metadata(file_path, content)
        structure_metadata = {}
        if file_path.lower().endswith(".md"):
            structure_metadata = self._extract_structure_metadata(content)
        metadata = {**file_metadata, **structure_metadata}
        self.logger.debug(f"Completed metadata extraction for {file_path!s}.")
        self.logger.debug(f"Metadata: {metadata!s}")
        return metadata

    def _extract_file_metadata(self, file_path: str, content: str) -> dict[str, Any]:
        """Return path-, size- and count-based metadata for one file.

        Args:
            file_path: Path of the file; made relative to ``self.base_path``.
            content: Decoded text content of the file.

        Returns:
            Dictionary with the keys ``file_type``, ``file_name``,
            ``file_directory``, ``file_encoding``, ``line_count``,
            ``word_count`` and ``file_size``.
        """
        rel_path = os.path.relpath(file_path, self.base_path)
        file_type = os.path.splitext(rel_path)[1]
        file_name = os.path.basename(rel_path)
        # The detected encoding is only a guess; measure the size through a
        # helper that falls back to UTF-8 rather than raising, and report the
        # encoding that was actually used for the measurement.
        file_encoding, file_size = self._measure_encoded_size(
            content, self._detect_encoding(content)
        )
        return {
            "file_type": file_type,
            "file_name": file_name,
            "file_directory": os.path.dirname("/" + rel_path),
            "file_encoding": file_encoding,
            "line_count": len(content.splitlines()),
            "word_count": len(content.split()),
            "file_size": file_size,
        }

    @staticmethod
    def _measure_encoded_size(content: str, encoding: str) -> tuple[str, int]:
        """Return ``(encoding_used, byte_length)`` for ``content``.

        ``encoding`` is tried first. If it is not a codec Python knows
        (``LookupError``) or cannot represent the text
        (``UnicodeEncodeError``), the size is measured in UTF-8 instead and
        ``"utf-8"`` is returned as the encoding used.

        Args:
            content: Text to measure.
            encoding: Preferred codec name.

        Returns:
            The codec name actually used and the encoded length in bytes.
        """
        try:
            return encoding, len(content.encode(encoding))
        except (LookupError, UnicodeEncodeError):
            # errors="replace" keeps this best-effort even for text that
            # strict UTF-8 rejects (e.g. lone surrogates).
            return "utf-8", len(content.encode("utf-8", errors="replace"))

    def _extract_structure_metadata(self, content: str) -> dict[str, Any]:
        """Return heading and table-of-contents information for the content.

        Args:
            content: Text to scan for ``#``-style headings.

        Returns:
            Dictionary with ``has_toc`` (whether the literal
            ``## Table of Contents`` or ``## Contents`` occurs),
            ``heading_levels`` (number of ``#`` characters per matched
            heading, in order) and ``sections_count`` (number of matches).
        """
        headings = self._HEADING_RE.findall(content)
        has_toc = "## Table of Contents" in content or "## Contents" in content
        heading_levels = [len(hashes) for hashes, _title in headings]
        return {
            "has_toc": has_toc,
            "heading_levels": heading_levels,
            "sections_count": len(heading_levels),
        }

    def _detect_encoding(self, content: str) -> str:
        """Return a lower-cased encoding name guessed for ``content``.

        The chardet result is used only when it names an encoding other than
        ASCII with confidence above 0.8. Empty content, a low-confidence or
        ASCII result, and any error during detection all yield ``"utf-8"``.

        Args:
            content: Text whose encoding should be guessed.

        Returns:
            The detected encoding name in lower case, or ``"utf-8"``.
        """
        if not content:
            return "utf-8"
        try:
            result = chardet.detect(content.encode())
            if (
                result["encoding"]
                and result["encoding"].lower() != "ascii"
                and result["confidence"] > 0.8
            ):
                return result["encoding"].lower()
        except Exception as e:
            # Detection is best-effort: log the failure and use the default.
            self.logger.error({"event": "Failed to detect encoding", "error": str(e)})
        return "utf-8"