# pentest_analyzer.py

import os
import sys
import logging
import hashlib
import json
import torch
import numpy as np
from datetime import datetime
from pathlib import Path
from typing import List, Dict, Any, Optional
from concurrent.futures import ThreadPoolExecutor, as_completed
from tqdm import tqdm

from llama_index.core import (
    SimpleDirectoryReader,
    VectorStoreIndex,
    StorageContext,
    load_index_from_storage,
    Document
)
from llama_index.core.prompts import PromptTemplate
from llama_index.llms.ollama import Ollama
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
from llama_index.vector_stores.chroma import ChromaVectorStore
import chromadb

class DocumentProcessor:
    """Handles document preprocessing and cleaning"""

    @staticmethod
    def clean_text(text: str) -> str:
        """Clean and normalize text content.

        Non-printable characters are dropped and every run of whitespace
        (including newlines and tabs) is collapsed into a single space.

        Args:
            text: Raw text to normalize.

        Returns:
            The normalized text, without leading or trailing whitespace.
        """
        # Drop non-printable characters first, but keep whitespace so word
        # boundaries survive (newlines and tabs are themselves non-printable).
        text = "".join(
            char for char in text if char.isprintable() or char.isspace()
        )
        # Collapse whitespace last, so a character removed from between two
        # spaces cannot leave a double space behind.
        return " ".join(text.split())

    @staticmethod
    def chunk_text(text: str, chunk_size: int = 1024) -> List[str]:
        """Split text into chunks of whole words.

        Words are never split. Each chunk is at most ``chunk_size`` characters
        long, except that a single word longer than ``chunk_size`` becomes a
        chunk of its own.

        Args:
            text: Text to split; words are separated on whitespace.
            chunk_size: Maximum length of a chunk in characters.

        Returns:
            The chunks in order; an empty list when ``text`` has no words.
        """
        chunks: List[str] = []
        current_chunk: List[str] = []
        current_size = 0  # always equals len(" ".join(current_chunk))

        for word in text.split():
            # A separating space is only needed when the chunk already has words.
            separator = 1 if current_chunk else 0
            projected_size = current_size + separator + len(word)

            # Only flush a non-empty chunk: an oversized first word must not
            # produce an empty chunk in front of it.
            if current_chunk and projected_size > chunk_size:
                chunks.append(" ".join(current_chunk))
                current_chunk = [word]
                current_size = len(word)
            else:
                current_chunk.append(word)
                current_size = projected_size

        if current_chunk:
            chunks.append(" ".join(current_chunk))

        return chunks

class FileProcessor:
    """File-type filtering and metadata extraction for input files"""

    # Lower-case suffixes (leading dot included) of the file types that are read.
    supported_extensions = {'.txt', '.json', '.md', '.xml', '.csv', '.log'}

    @staticmethod
    def is_supported_file(file_path: Path) -> bool:
        """Return True when the file's suffix, compared case-insensitively, is supported"""
        extension = file_path.suffix.lower()
        return extension in FileProcessor.supported_extensions

    @staticmethod
    def get_file_metadata(file_path: Path) -> Dict[str, Any]:
        """Collect name, type, size and modification time of a file.

        Returns:
            A dict with ``file_name``, ``file_type`` (lower-case suffix without
            the dot), ``file_size`` (bytes) and ``last_modified`` (ISO 8601
            string built from the file's modification timestamp).
        """
        extension = file_path.suffix.lower()
        size_in_bytes = os.path.getsize(file_path)
        modified_at = datetime.fromtimestamp(os.path.getmtime(file_path))

        return {
            'file_name': file_path.name,
            'file_type': extension[1:],
            'file_size': size_in_bytes,
            'last_modified': modified_at.isoformat(),
        }

class OptimizedPentestAnalyzer:
    """Load pentest documents from disk and set up the models used to analyze them.

    This part of the class covers configuration, logging, device selection,
    model initialization and parallel document loading.
    """

    def __init__(self,
                 persist_dir: str = "./stored_embeddings",
                 chroma_dir: str = "./chroma_db",
                 batch_size: int = 50,
                 max_workers: int = 4):
        """
        Initialize the Pentest Analyzer with optimized settings

        Args:
            persist_dir: Directory for storing embeddings
            chroma_dir: Directory for ChromaDB
            batch_size: Size of batches for processing
            max_workers: Maximum number of threads for parallel processing
        """
        self.persist_dir = Path(persist_dir)
        self.chroma_dir = Path(chroma_dir)
        self.batch_size = batch_size
        self.max_workers = max_workers

        # Initialize components (logging first: the other steps log through it)
        self.setup_logging()
        self.setup_device()
        self.initialize_models()

        # Create necessary directories
        self.persist_dir.mkdir(parents=True, exist_ok=True)
        self.chroma_dir.mkdir(parents=True, exist_ok=True)

    def setup_logging(self):
        """Configure logging to stdout and to ``pentest_analyzer.log``"""
        logging.basicConfig(
            level=logging.INFO,
            format='%(asctime)s - %(levelname)s - %(message)s',
            handlers=[
                logging.StreamHandler(sys.stdout),
                logging.FileHandler("pentest_analyzer.log")
            ]
        )
        self.logger = logging.getLogger(__name__)

    def setup_device(self):
        """Select the device for model inference: ``mps``, then ``cuda``, then ``cpu``"""
        # torch.backends.mps does not exist on every torch build, so look it up
        # defensively instead of failing with AttributeError.
        mps_backend = getattr(torch.backends, "mps", None)
        if mps_backend is not None and mps_backend.is_available():  # Apple Silicon
            self.device = "mps"
        elif torch.cuda.is_available():
            self.device = "cuda"
        else:
            self.device = "cpu"
        self.logger.info(f"Using device: {self.device}")

    def initialize_models(self):
        """Initialize LLM and embedding models.

        The Ollama server address is built from the ``OLLAMA_HOST`` (default
        ``localhost``) and ``OLLAMA_PORT`` (default ``11434``) environment
        variables.

        Raises:
            Exception: Any error raised while constructing the models is
                logged and re-raised.
        """
        try:
            # Get Ollama API endpoint from environment variables
            ollama_host = os.getenv('OLLAMA_HOST', 'localhost')
            ollama_port = os.getenv('OLLAMA_PORT', '11434')
            ollama_endpoint = f"http://{ollama_host}:{ollama_port}"

            # Initialize Ollama. The server address must be passed as
            # ``base_url``; an ``endpoint`` keyword is not a recognised option
            # and would leave the client pointing at its default address.
            self.llm = Ollama(
                model="deepseek-r1:latest",
                request_timeout=300.0,
                base_url=ollama_endpoint
            )

            # Initialize embedding model
            self.embed_model = HuggingFaceEmbedding(
                model_name="BAAI/bge-small-en-v1.5",
                device=self.device,
                max_length=512
            )

            self.logger.info("Models initialized successfully")
        except Exception as e:
            self.logger.error(f"Error initializing models: {e}")
            raise

    def get_document_hash(self, documents: List[Document]) -> str:
        """Generate a hash of documents to detect changes.

        The hash depends only on the set of document texts, not on their order:
        documents are loaded in parallel, so their order can differ between
        runs over identical input.

        Args:
            documents: Documents whose ``text`` is fingerprinted.

        Returns:
            A 32-character hexadecimal MD5 digest.
        """
        # Hash every document separately and sort the digests. Fixed-length
        # digests also remove the ambiguity of plain concatenation
        # ("ab" + "c" versus "a" + "bc").
        digests = sorted(
            hashlib.md5(doc.text.encode()).hexdigest() for doc in documents
        )
        return hashlib.md5("".join(digests).encode()).hexdigest()

    def process_single_file(self, file_path: Path) -> Optional[Document]:
        """Process a single file with error handling.

        Args:
            file_path: File to read as UTF-8 text.

        Returns:
            A Document holding the cleaned text and the file metadata, or None
            when the file type is unsupported or the file cannot be processed
            (the reason is logged).
        """
        try:
            if not FileProcessor.is_supported_file(file_path):
                self.logger.warning(f"Unsupported file type: {file_path}")
                return None

            metadata = FileProcessor.get_file_metadata(file_path)

            with open(file_path, 'r', encoding='utf-8') as f:
                content = f.read()

            # Clean and preprocess text
            cleaned_text = DocumentProcessor.clean_text(content)

            # Create document with metadata
            return Document(
                text=cleaned_text,
                metadata=metadata
            )

        except Exception as e:
            # Best effort: one unreadable file must not abort the whole run.
            self.logger.error(f"Error processing file {file_path}: {e}")
            return None

    def parallel_document_loading(self, directory: str) -> List[Document]:
        """Load and process documents in parallel.

        Args:
            directory: Directory searched recursively for supported files.

        Returns:
            The successfully processed documents, ordered by file path.

        Raises:
            FileNotFoundError: If the directory does not exist.
            ValueError: If no supported file is found or none can be processed.
        """
        try:
            # Convert to absolute path and expand user directory
            directory_path = Path(directory).expanduser().resolve()

            if not directory_path.exists():
                raise FileNotFoundError(f"Directory not found: {directory_path}")

            self.logger.info(f"Scanning directory: {directory_path}")

            # Recursively find all files; sorting gives a stable processing order
            matched_files = set()
            for ext in FileProcessor.supported_extensions:
                matched_files.update(directory_path.rglob(f"*{ext}"))
            all_files = sorted(matched_files)

            if not all_files:
                raise ValueError(f"No supported files found in {directory_path}\n"
                               f"Supported extensions: {FileProcessor.supported_extensions}")

            self.logger.info(f"Found {len(all_files)} supported files")

            loaded: Dict[Path, Document] = {}
            with ThreadPoolExecutor(max_workers=self.max_workers) as executor:
                future_to_file = {
                    executor.submit(self.process_single_file, file_path): file_path
                    for file_path in all_files
                }

                for future in tqdm(
                    as_completed(future_to_file),
                    total=len(all_files),
                    desc="Processing files"
                ):
                    file_path = future_to_file[future]
                    try:
                        doc = future.result()
                    except Exception as e:
                        self.logger.error(f"Error processing {file_path}: {e}")
                        continue
                    if doc is not None:
                        loaded[file_path] = doc

            # Futures complete in arbitrary order; return documents in file order
            documents = [loaded[path] for path in all_files if path in loaded]

            if not documents:
                raise ValueError("No documents were successfully processed")

            self.logger.info(f"Successfully processed {len(documents)} documents")
            return documents

        except Exception as e:
            self.logger.error(f"Error in document loading: {e}")
            raise

class OptimizedPentestAnalyzer(OptimizedPentestAnalyzer):  # Continuing from Part 1
    """Index creation, index caching and report generation.

    This definition extends the class of the same name defined earlier in the
    file and rebinds the name to the extended class.
    """

    # Name of the ChromaDB collection that stores the vectors.
    _COLLECTION_NAME = "pentest_data"
    # A cached index older than this many days is rebuilt.
    _CACHE_MAX_AGE_DAYS = 7

    def create_or_load_index(self, documents: List[Document]) -> VectorStoreIndex:
        """Create new index or load cached index based on document hash"""
        document_hash = self.get_document_hash(documents)
        hash_file = self.persist_dir / "document_hash.json"

        # Check if cache exists and is valid
        if self._is_cache_valid(hash_file, document_hash):
            return self._load_cached_index()

        return self._create_new_index(documents, document_hash)

    def _is_cache_valid(self, hash_file: Path, current_hash: str) -> bool:
        """Check if cached embeddings are valid.

        The cache is valid when the cache file exists, records the same
        document hash and embedding model name, and is younger than
        ``_CACHE_MAX_AGE_DAYS`` days.

        Returns:
            True when the cached index can be reused; False otherwise,
            including when the cache file cannot be read or parsed.
        """
        try:
            if not hash_file.exists():
                return False

            with open(hash_file, 'r', encoding='utf-8') as f:
                cache_info = json.load(f)

            created_at = cache_info.get('created_at')
            if not created_at:
                # No timestamp recorded: treat the cache as stale.
                return False

            cache_age = datetime.now() - datetime.fromisoformat(created_at)
            return (
                cache_info.get('hash') == current_hash and
                cache_info.get('embed_model') == self.embed_model.model_name and
                cache_age.days < self._CACHE_MAX_AGE_DAYS
            )
        except Exception as e:
            self.logger.error(f"Error checking cache validity: {e}")
            return False

    def _load_cached_index(self) -> VectorStoreIndex:
        """Load index from cache.

        Raises:
            Exception: Any error raised while loading is logged and re-raised.
        """
        try:
            self.logger.info("Loading cached index...")

            # Initialize ChromaDB client
            chroma_client = chromadb.PersistentClient(path=str(self.chroma_dir))
            chroma_collection = chroma_client.get_or_create_collection(
                self._COLLECTION_NAME
            )

            # Create storage context
            storage_context = StorageContext.from_defaults(
                persist_dir=str(self.persist_dir),
                vector_store=ChromaVectorStore(chroma_collection=chroma_collection)
            )

            # Load index with the same embedding model that built it, so that
            # queries are embedded consistently with the stored vectors.
            return load_index_from_storage(
                storage_context,
                embed_model=self.embed_model
            )

        except Exception as e:
            self.logger.error(f"Error loading cached index: {e}")
            raise

    def _create_new_index(self, documents: List[Document], document_hash: str) -> VectorStoreIndex:
        """Create new index with optimized batch processing.

        Vectors left in the collection by an earlier build are discarded first.

        Raises:
            Exception: Any error raised while building is logged and re-raised.
        """
        try:
            self.logger.info("Creating new index...")

            # Initialize ChromaDB
            collection_metadata = {"description": "Pentest findings and analysis"}
            chroma_client = chromadb.PersistentClient(path=str(self.chroma_dir))
            chroma_collection = chroma_client.get_or_create_collection(
                name=self._COLLECTION_NAME,
                metadata=collection_metadata
            )

            # A rebuild must start from an empty collection; otherwise vectors
            # from previous builds stay behind and pollute retrieval.
            if chroma_collection.count() > 0:
                self.logger.info("Discarding vectors from a previous index build")
                chroma_client.delete_collection(name=self._COLLECTION_NAME)
                chroma_collection = chroma_client.create_collection(
                    name=self._COLLECTION_NAME,
                    metadata=collection_metadata
                )

            # Create vector store
            vector_store = ChromaVectorStore(chroma_collection=chroma_collection)

            # Create storage context
            storage_context = StorageContext.from_defaults(vector_store=vector_store)

            # Process documents in batches
            index = self._batch_process_documents(documents, storage_context)

            # Save index and cache information
            self._save_index_and_cache(index, document_hash)

            return index

        except Exception as e:
            self.logger.error(f"Error creating new index: {e}")
            raise

    def _batch_process_documents(
        self,
        documents: List[Document],
        storage_context: StorageContext
    ) -> VectorStoreIndex:
        """Chunk large documents batch by batch, then build the index.

        Documents longer than 4096 characters are split with
        ``DocumentProcessor.chunk_text``; each chunk keeps the source metadata
        plus ``chunk_id`` and ``total_chunks``. The index itself is built in a
        single call once all batches have been prepared.

        Raises:
            Exception: Any error raised while processing is logged and re-raised.
        """
        try:
            all_docs = []

            for i in tqdm(range(0, len(documents), self.batch_size), desc="Processing document batches"):
                batch = documents[i:i + self.batch_size]

                # Process each document in the batch
                for doc in batch:
                    # Chunk large documents
                    if len(doc.text) > 4096:  # Adjust threshold as needed
                        chunks = DocumentProcessor.chunk_text(doc.text)
                        all_docs.extend(
                            Document(
                                text=chunk,
                                metadata={
                                    **doc.metadata,
                                    'chunk_id': idx,
                                    'total_chunks': len(chunks)
                                }
                            )
                            for idx, chunk in enumerate(chunks)
                        )
                    else:
                        all_docs.append(doc)

                # Clear memory periodically (every fifth batch, CUDA only)
                if i % (self.batch_size * 5) == 0 and torch.cuda.is_available():
                    torch.cuda.empty_cache()

            return VectorStoreIndex.from_documents(
                all_docs,
                storage_context=storage_context,
                embed_model=self.embed_model,
                show_progress=True
            )

        except Exception as e:
            self.logger.error(f"Error in batch processing: {e}")
            raise

    def _save_index_and_cache(self, index: VectorStoreIndex, document_hash: str):
        """Save index and cache information.

        Persists the index storage to ``persist_dir`` and writes
        ``document_hash.json`` next to it with the document hash, embedding
        model name, creation time and document count.

        Raises:
            Exception: Any error raised while saving is logged and re-raised.
        """
        try:
            # Save index
            index.storage_context.persist(persist_dir=str(self.persist_dir))

            # Save cache information
            cache_info = {
                'hash': document_hash,
                'embed_model': self.embed_model.model_name,
                'created_at': datetime.now().isoformat(),
                # NOTE(review): informational only; this may be 0 if the vector
                # store keeps the node text instead of the docstore - confirm.
                'total_documents': len(index.docstore.docs)
            }

            with open(self.persist_dir / "document_hash.json", 'w', encoding='utf-8') as f:
                json.dump(cache_info, f, indent=2)

        except Exception as e:
            self.logger.error(f"Error saving index and cache: {e}")
            raise

    def generate_report(self, index: VectorStoreIndex) -> str:
        """Generate comprehensive pentest report.

        Args:
            index: Index queried for the context passed to the LLM.

        Returns:
            The report text produced by the LLM.

        Raises:
            Exception: Any error raised while querying is logged and re-raised.
        """
        try:
            # Ensure we're using the Ollama LLM instance
            if not hasattr(self, 'llm') or self.llm is None:
                self.llm = Ollama(model="deepseek-r1:latest", request_timeout=300.0)
            # Custom prompt template for report generation. Every placeholder in
            # it must be one the query engine supplies; the only one used here
            # is {context_str}.
            template = PromptTemplate("""
            As a senior penetration tester, analyze the provided security findings and generate 
            a comprehensive penetration testing report following this structure:

            1. Executive Summary
               - Overview of key findings
               - Risk assessment summary
               - Critical recommendations

            2. Methodology
               - Testing approach
               - Tools and techniques used
               - Scope of assessment

            3. Findings and Vulnerabilities
               Group findings by severity (Critical, High, Medium, Low, Informational).
               For each finding include:
               - Description
               - Impact
               - Proof of Concept
               - Remediation Steps

            4. Risk Analysis
               - Risk scoring methodology
               - Business impact assessment
               - Exploitation likelihood

            5. Detailed Technical Analysis
               - Attack vectors
               - System vulnerabilities
               - Configuration issues

            6. Remediation Roadmap
               - Prioritized recommendations
               - Timeline suggestions
               - Resource requirements

            Context Information:
            {context_str}

            Generate a detailed, professional report based on the provided data.
            Focus on actionable insights and clear technical explanations.
            """)

            # Configure query engine
            # NOTE(review): context_window is forwarded as an extra keyword;
            # confirm that the query engine actually honours it.
            query_engine = index.as_query_engine(
                text_qa_template=template,
                similarity_top_k=5,  # Retrieve top 5 most relevant chunks
                context_window=4096,  # Increased context window
                streaming=True,  # Enable streaming for large reports
                llm=self.llm  # Explicitly setting the Ollama LLM here
            )

            # Generate report
            response = query_engine.query(
                "Generate a complete penetration testing report based on the provided data."
            )

            # Format the report: prefer the ready-made text when the response
            # object carries one, otherwise fall back to its string form.
            report_text = getattr(response, 'response', None)
            if report_text is None:
                report_text = str(response)
            return report_text

        except Exception as e:
            self.logger.error(f"Error generating report: {e}")
            self.logger.error(f"Error details: {str(e.__class__.__name__)}")
            raise

    def process_and_analyze(self, directory: str) -> str:
        """Main method to process documents and generate report.

        Loads the documents, creates or loads the index, generates the report
        and writes it to ``pentest_report.md`` in the current working directory.

        Args:
            directory: Directory holding the pentest data files.

        Returns:
            The generated report text.

        Raises:
            Exception: Any error raised along the way is logged and re-raised.
        """
        start_time = datetime.now()

        try:
            # Load documents with parallel processing
            self.logger.info("Loading documents...")
            documents = self.parallel_document_loading(directory)

            # Defensive guard in case the loader ever returns an empty list
            if not documents:
                raise ValueError("No valid documents found in the specified directory")

            # Create or load index
            self.logger.info("Processing documents and creating index...")
            index = self.create_or_load_index(documents)

            # Generate report
            self.logger.info("Generating report...")
            report = self.generate_report(index)

            # Save report. UTF-8 is set explicitly: the platform default
            # encoding may be unable to represent characters in the LLM output.
            report_path = Path("pentest_report.md")
            with open(report_path, "w", encoding="utf-8") as f:
                f.write(report)

            processing_time = datetime.now() - start_time
            self.logger.info(f"Total processing time: {processing_time}")

            return report

        except Exception as e:
            self.logger.error(f"Error in document processing and analysis: {e}")
            raise


def scan_directory(directory: str) -> None:
    """Utility function to scan directory and show file information.

    Prints the supported files grouped by extension, with their size in KB,
    followed by up to ten unsupported files and a count of the remainder.
    Paths are shown relative to the scanned directory, in sorted order, so
    repeated scans of the same tree print the same listing.

    Args:
        directory: Directory to scan recursively; ``~`` is expanded.

    Any error is reported on stdout instead of being raised.
    """
    try:
        directory_path = Path(directory).expanduser().resolve()
        print(f"\nScanning directory: {directory_path}")

        if not directory_path.exists():
            print(f"Directory not found: {directory_path}")
            return

        print("\nFound files:")
        supported_files = set()
        for ext in sorted(FileProcessor.supported_extensions):
            # Keep regular files only: a directory whose name happens to end in
            # a supported suffix is not an input file.
            files = sorted(
                path for path in directory_path.rglob(f"*{ext}") if path.is_file()
            )
            if not files:
                continue

            supported_files.update(files)
            print(f"\n{ext} files:")
            for file in files:
                size = os.path.getsize(file) / 1024  # Convert to KB
                print(f"- {file.relative_to(directory_path)} ({size:.2f} KB)")

        print("\nUnsupported files:")
        # Sorted so the ten entries shown are the same on every run
        unsupported = sorted(
            path for path in directory_path.rglob("*")
            if path.is_file() and path not in supported_files
        )
        for file in unsupported[:10]:  # Show first 10 unsupported files
            print(f"- {file.relative_to(directory_path)}")

        if len(unsupported) > 10:
            print(f"... and {len(unsupported) - 10} more")

    except Exception as e:
        print(f"Error scanning directory: {e}")

def main():
    """Interactive entry point.

    Builds the analyzer, asks for the data directory (an empty answer selects
    the default path), shows a scan of it, and after confirmation generates
    the report. Exits with status 0 when the user declines and with status 1
    after printing troubleshooting hints when an error occurs.
    """
    # Configuration
    settings = dict(
        persist_dir='./stored_embeddings',
        chroma_dir='./chroma_db',
        batch_size=50,
        max_workers=4,
    )

    try:
        analyzer = OptimizedPentestAnalyzer(**settings)

        # Ask for the input directory, falling back to the default path
        requested = input("Enter the path to your pentest data directory: ").strip()
        chosen = requested or "./demo data/"

        # Expand the user directory and make the path absolute
        input_dir = os.path.abspath(os.path.expanduser(chosen))

        if not os.path.exists(input_dir):
            raise FileNotFoundError(f"Directory not found: {input_dir}")

        scan_directory(input_dir)

        answer = input("\nProceed with processing? (y/n): ").lower().strip()
        if answer != 'y':
            print("Operation cancelled by user")
            # SystemExit is not an Exception, so the handler below lets it through
            sys.exit(0)

        extension_list = ', '.join(FileProcessor.supported_extensions)
        print(f"\nProcessing files from: {input_dir}")
        print(f"Supported file types: {extension_list}\n")

        # Process documents and generate report (also saved by the analyzer)
        analyzer.process_and_analyze(input_dir)

        report_location = os.path.abspath('pentest_report.md')
        print("\nReport generated successfully!")
        print(f"Report saved to: {report_location}")

    except Exception as e:
        print(f"\nError: {e!s}")

        # Print additional troubleshooting information
        print("\nTroubleshooting steps:")
        print("1. Verify the directory path is correct")
        print("2. Ensure the directory contains supported file types:",
              ', '.join(FileProcessor.supported_extensions))
        print("3. Check file permissions")
        print("\nFor detailed error information, check the log file: pentest_analyzer.log")
        sys.exit(1)

if __name__ == "__main__":
    main()