Coverage for src/qdrant_loader/core/chunking/strategy/code/metadata/dependencies.py: 93%

29 statements  

« prev     ^ index     » next       coverage.py v7.10.6, created at 2025-09-08 06:05 +0000

1from __future__ import annotations 

2 

3import re 

4 

5 

def build_dependency_graph(content: str) -> dict[str, list[str]]:
    """Extract import-like references from *content* and bucket them.

    Five regular expressions are run over the text, one after another,
    looking for the shapes ``import X``, ``from X import``,
    ``#include <X>`` / ``#include "X"``, ``require('X')`` and
    ``import ... from 'X'``.  The patterns are unanchored and applied
    independently, so a single statement can contribute more than one
    entry (``from a import b`` yields both ``b`` and ``a``) and
    duplicates are kept.

    Args:
        content: Source text to scan.

    Returns:
        A dict with four list values:

        * ``"imports"`` - every captured name, grouped by pattern in the
          order the patterns are tried.
        * ``"stdlib_imports"`` - names whose first dotted component is in
          the built-in list of standard-library module names.
        * ``"third_party_imports"`` - remaining names accepted by
          ``is_third_party_import``.
        * ``"internal_references"`` - everything else.
    """
    stdlib_roots = frozenset(
        (
            "os",
            "sys",
            "json",
            "math",
            "random",
            "datetime",
            "collections",
            "itertools",
            "functools",
            "operator",
            "re",
            "urllib",
            "http",
            "pathlib",
            "typing",
            "dataclasses",
            "abc",
            "enum",
            "logging",
            "threading",
            "multiprocessing",
            "subprocess",
            "socket",
            "sqlite3",
            "csv",
            "pickle",
            "gzip",
            "zipfile",
            "tarfile",
            "shutil",
            "tempfile",
        )
    )

    reference_patterns = (
        r"import\s+([a-zA-Z_][a-zA-Z0-9_.]*)",
        r"from\s+([a-zA-Z_][a-zA-Z0-9_.]*)\s+import",
        r'#include\s*[<"]([^>"]+)[>"]',
        r"require\s*\([\'\"]([^\'\"]+)[\'\"]\)",
        r"import\s+.*\s+from\s+[\'\"]([^\'\"]+)[\'\"]",
    )

    # Pattern-major order: all hits of the first pattern, then the second, ...
    found = [
        name
        for pattern in reference_patterns
        for name in re.findall(pattern, content)
    ]

    stdlib: list[str] = []
    third_party: list[str] = []
    internal: list[str] = []

    for name in found:
        # Only the leading dotted component decides stdlib membership.
        root = name.partition(".")[0]
        if root in stdlib_roots:
            bucket = stdlib
        elif is_third_party_import(name):
            bucket = third_party
        else:
            bucket = internal
        bucket.append(name)

    return {
        "imports": found,
        "internal_references": internal,
        "third_party_imports": third_party,
        "stdlib_imports": stdlib,
    }

70 

71 

def is_third_party_import(import_name: str) -> bool:
    """Heuristically decide whether *import_name* is a third-party package.

    Only the first dotted component, lower-cased, is examined.  The checks
    run in this order and the first one that applies wins:

    1. The component is in a fixed set of well-known package names.
    2. The component contains ``lib``, ``client``, ``sdk`` or ``api``.
    3. The component contains an underscore but does not start with one.
    4. The component does not start with ``test`` and is not one of
       ``main``, ``app``, ``config``, ``utils``, ``helpers``; in that case
       the result is ``str.islower()`` of the component, which is false
       only when it contains no cased characters (for example an empty
       string).

    Args:
        import_name: Possibly dotted module or path name.

    Returns:
        ``True`` if any of the rules above classifies the name as
        third-party, ``False`` otherwise.
    """
    root = import_name.split(".", 1)[0].lower()

    well_known = frozenset(
        (
            "requests",
            "numpy",
            "pandas",
            "flask",
            "django",
            "fastapi",
            "tensorflow",
            "torch",
            "pytorch",
            "sklearn",
            "scipy",
            "matplotlib",
            "seaborn",
            "plotly",
            "streamlit",
            "dash",
            "celery",
            "redis",
            "sqlalchemy",
            "alembic",
            "pydantic",
            "marshmallow",
            "click",
            "typer",
            "pytest",
            "unittest2",
            "mock",
            "httpx",
            "aiohttp",
            "websockets",
            "uvicorn",
            "gunicorn",
            "jinja2",
            "mako",
            "babel",
            "pillow",
            "opencv",
            "cv2",
            "boto3",
            "azure",
            "google",
        )
    )
    if root in well_known:
        return True

    # Substrings that mark a name as an external library.
    for fragment in ("lib", "client", "sdk", "api"):
        if fragment in root:
            return True

    # snake_case names count as third-party unless they are underscore-private.
    if not root.startswith("_") and "_" in root:
        return True

    # Conventional local module names are never third-party.
    local_names = ("main", "app", "config", "utils", "helpers")
    if root.startswith("test") or root in local_names:
        return False

    # ``root`` is already lower-cased, so this only rejects names that have
    # no cased characters at all.
    return root.islower()