diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index bc97a64..af120ff 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -43,13 +43,13 @@ jobs: - name: Build and test with uv run: | uv venv --python ${{ matrix.python-version }} - CMAKE_ARGS="-DENABLE_GLCACHE=OFF -DENABLE_LRB=OFF -DENABLE_3L_CACHE=OFF" uv pip install -e .[dev] -vvv + uv pip install -e .[dev] -vvv uv run python -c "import libcachesim; print('✓ Import successful for Python ${{ matrix.python-version }} on ${{ matrix.os }}')" - name: Run tests run: | if [ -d "tests" ]; then - uv run python -m pytest tests/ -v + uv run python -m pytest tests/ -v -m "not optional" else echo "No tests directory found, skipping tests" fi diff --git a/CMakeLists.txt b/CMakeLists.txt index 05bdbc2..3e63c5c 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -12,25 +12,15 @@ endif() message(STATUS "CMAKE_BUILD_TYPE: ${CMAKE_BUILD_TYPE}") # Options -option(ENABLE_GLCACHE "Enable group-learned cache" ON) -option(ENABLE_LRB "Enable LRB" ON) -option(ENABLE_3L_CACHE "Enable 3LCache" ON) +option(ENABLE_GLCACHE "Enable group-learned cache" OFF) +option(ENABLE_LRB "Enable LRB" OFF) +option(ENABLE_3L_CACHE "Enable 3LCache" OFF) # C++ standard set(CMAKE_CXX_STANDARD 17) set(CMAKE_CXX_STANDARD_REQUIRED ON) set(CMAKE_POSITION_INDEPENDENT_CODE ON) -# ============================================================================ -set(USER_PREFIX "$ENV{HOME}/local") - -list(APPEND CMAKE_PREFIX_PATH "${USER_PREFIX}") -list(APPEND CMAKE_LIBRARY_PATH "${USER_PREFIX}/lib") -list(APPEND CMAKE_INCLUDE_PATH "${USER_PREFIX}/include") - -include_directories("${USER_PREFIX}/include") -link_directories("${USER_PREFIX}/lib") - # ============================================================================= # Compiler Flags Configuration # ============================================================================= @@ -165,18 +155,6 @@ configure_logging() # Dependency Management # 
============================================================================= -# Add user-installed dependencies to search paths -if(DEFINED ENV{CMAKE_PREFIX_PATH}) - list(PREPEND CMAKE_PREFIX_PATH $ENV{CMAKE_PREFIX_PATH}) -endif() - -# Add common user installation paths -set(USER_PREFIX_PATHS - "$ENV{HOME}/local" - "$ENV{HOME}/.local" - "/usr/local" -) - # Find required packages find_package(Python3 REQUIRED COMPONENTS Interpreter Development.Module) find_package(pybind11 CONFIG REQUIRED) @@ -192,85 +170,23 @@ include_directories(${GLib_INCLUDE_DIRS}) link_directories(${GLib_LIBRARY_DIRS}) list(APPEND required_libs ${GLib_LIBRARIES}) -# ZSTD dependency - try multiple find methods -find_package(ZSTD QUIET) -if(NOT ZSTD_FOUND) - # Try pkg-config - pkg_check_modules(ZSTD_PC QUIET libzstd) - if(ZSTD_PC_FOUND) - set(ZSTD_FOUND TRUE) - set(ZSTD_INCLUDE_DIR ${ZSTD_PC_INCLUDE_DIRS}) - set(ZSTD_LIBRARIES ${ZSTD_PC_LIBRARIES}) - set(ZSTD_LIBRARY_DIRS ${ZSTD_PC_LIBRARY_DIRS}) - else() - # Try manual find - find_path(ZSTD_INCLUDE_DIR zstd.h - PATHS ${CMAKE_INCLUDE_PATH} - PATH_SUFFIXES zstd - ) - find_library(ZSTD_LIBRARIES zstd - PATHS ${CMAKE_LIBRARY_PATH} - ) - if(ZSTD_INCLUDE_DIR AND ZSTD_LIBRARIES) - set(ZSTD_FOUND TRUE) - endif() - endif() -endif() - -if(NOT ZSTD_FOUND) - message(FATAL_ERROR "ZSTD not found. 
Please install zstd or set CMAKE_PREFIX_PATH to point to user installation.") -endif() - +# ZSTD dependency +find_package(ZSTD REQUIRED) message(STATUS "ZSTD_INCLUDE_DIR: ${ZSTD_INCLUDE_DIR}, ZSTD_LIBRARIES: ${ZSTD_LIBRARIES}") -include_directories(${ZSTD_INCLUDE_DIR}) -if(ZSTD_LIBRARY_DIRS) - link_directories(${ZSTD_LIBRARY_DIRS}) +if("${ZSTD_LIBRARIES}" STREQUAL "") + message(FATAL_ERROR "zstd not found") endif() +include_directories(${ZSTD_INCLUDE_DIR}) +link_directories(${ZSTD_LIBRARY_DIRS}) list(APPEND required_libs ${ZSTD_LIBRARIES}) -# TCMalloc dependency (optional) -find_library(TCMALLOC_LIBRARY tcmalloc - PATHS ${CMAKE_LIBRARY_PATH} -) -if(TCMALLOC_LIBRARY) - list(APPEND optional_libs ${TCMALLOC_LIBRARY}) - message(STATUS "TCMalloc found: ${TCMALLOC_LIBRARY}") - add_compile_definitions(USE_TCMALLOC=1) -else() - message(STATUS "TCMalloc not found, using system malloc") -endif() - # Optional dependencies based on features if(ENABLE_GLCACHE) - # Try to find XGBoost - find_package(xgboost QUIET) - if(NOT xgboost_FOUND) - # Try manual find for user installation - find_path(XGBOOST_INCLUDE_DIR xgboost - PATHS ${CMAKE_INCLUDE_PATH} - ) - find_library(XGBOOST_LIBRARIES xgboost - PATHS ${CMAKE_LIBRARY_PATH} - ) - if(XGBOOST_INCLUDE_DIR AND XGBOOST_LIBRARIES) - set(xgboost_FOUND TRUE) - add_library(xgboost::xgboost UNKNOWN IMPORTED) - set_target_properties(xgboost::xgboost PROPERTIES - IMPORTED_LOCATION ${XGBOOST_LIBRARIES} - INTERFACE_INCLUDE_DIRECTORIES ${XGBOOST_INCLUDE_DIR} - ) - endif() - endif() - - if(xgboost_FOUND) - include_directories(${XGBOOST_INCLUDE_DIR}) - list(APPEND optional_libs xgboost::xgboost) - add_compile_definitions(ENABLE_GLCACHE=1) - message(STATUS "XGBOOST_INCLUDE_DIR: ${XGBOOST_INCLUDE_DIR}") - else() - message(WARNING "XGBoost not found, disabling GLCACHE feature") - set(ENABLE_GLCACHE OFF) - endif() + find_package(xgboost REQUIRED) + include_directories(${XGBOOST_INCLUDE_DIR}) + list(APPEND optional_libs xgboost::xgboost) + 
add_compile_definitions(ENABLE_GLCACHE=1) + message(STATUS "XGBOOST_INCLUDE_DIR: ${XGBOOST_INCLUDE_DIR}") endif() # LightGBM for LRB and 3L_CACHE @@ -285,30 +201,22 @@ foreach(FEATURE ${LIGHTGBM_FEATURES}) endforeach() if(LIGHTGBM_NEEDED) - # Try to find LightGBM if(NOT DEFINED LIGHTGBM_PATH) - find_path(LIGHTGBM_PATH LightGBM - PATHS ${CMAKE_INCLUDE_PATH} - ) - endif() - - if(NOT DEFINED LIGHTGBM_LIB) - find_library(LIGHTGBM_LIB _lightgbm - PATHS ${CMAKE_LIBRARY_PATH} - ) + find_path(LIGHTGBM_PATH LightGBM) endif() - if(NOT LIGHTGBM_PATH) - message(FATAL_ERROR "LIGHTGBM_PATH not found. Please install LightGBM or set CMAKE_PREFIX_PATH.") + message(FATAL_ERROR "LIGHTGBM_PATH not found") endif() + if(NOT DEFINED LIGHTGBM_LIB) + find_library(LIGHTGBM_LIB _lightgbm) + endif() if(NOT LIGHTGBM_LIB) - message(FATAL_ERROR "LIGHTGBM_LIB not found. Please install LightGBM or set CMAKE_PREFIX_PATH.") + message(FATAL_ERROR "LIGHTGBM_LIB not found") endif() include_directories(${LIGHTGBM_PATH}) list(APPEND optional_libs ${LIGHTGBM_LIB}) - message(STATUS "LightGBM found: ${LIGHTGBM_PATH}, ${LIGHTGBM_LIB}") endif() # ============================================================================= @@ -411,4 +319,4 @@ configure_platform_specific_linking(libcachesim_python) # Installation # ============================================================================= -install(TARGETS libcachesim_python LIBRARY DESTINATION libcachesim) +install(TARGETS libcachesim_python LIBRARY DESTINATION libcachesim) \ No newline at end of file diff --git a/docs/mkdocs.yml b/docs/mkdocs.yml index f2686f3..481de58 100644 --- a/docs/mkdocs.yml +++ b/docs/mkdocs.yml @@ -9,17 +9,24 @@ docs_dir: src nav: - Home: - libCacheSim Python: index.md - - Getting Started: - - Quick Start: quickstart.md + - Getting Started: + - getting_started/quickstart.md + - getting_started/installation.md + - Examples: + - examples/simulation.md + - examples/analysis.md + - examples/plugins.md + - User Guide: + - FAQ: faq.md + 
- Developer Guide: + - General: developer.md - API Reference: - API Documentation: api.md - - Examples: - - Usage Examples: examples.md theme: name: material - logo: assets/logos/logo-only-light.ico - favicon: assets/logos/logo-only-light.ico + # logo: assets/logos/logo-only-light.ico + # favicon: assets/logos/logo-only-light.ico language: en palette: # Palette toggle for automatic mode @@ -77,7 +84,9 @@ plugins: build: true nav_translations: Home: 首页 - Quick Start: 快速开始 + Getting Started: 快速开始 + User Guide: 用户指南 + Developer Guide: 开发者指南 API Reference: API参考 Examples: 使用示例 diff --git a/docs/src/assets/logos/logo.jpg b/docs/src/assets/logos/logo.jpg new file mode 100644 index 0000000..779fe2a Binary files /dev/null and b/docs/src/assets/logos/logo.jpg differ diff --git a/docs/src/en/api.md b/docs/src/en/api.md index b3c4a68..8c3fc1b 100644 --- a/docs/src/en/api.md +++ b/docs/src/en/api.md @@ -1,395 +1,3 @@ # API Reference -This page provides detailed API documentation for the libCacheSim Python bindings. - -## Core Classes - -### Cache Classes - -All cache classes inherit from the base cache interface and provide the following methods: - -```python -class Cache: - """Base cache interface.""" - - def get(self, obj_id: int, obj_size: int = 1) -> bool: - """Request an object from the cache. - - Args: - obj_id: Object identifier - obj_size: Object size in bytes - - Returns: - True if cache hit, False if cache miss - """ - - def get_hit_ratio(self) -> float: - """Get the current cache hit ratio.""" - - def get_miss_ratio(self) -> float: - """Get the current cache miss ratio.""" - - def get_num_hits(self) -> int: - """Get the total number of cache hits.""" - - def get_num_misses(self) -> int: - """Get the total number of cache misses.""" -``` - -### Available Cache Algorithms - -```python -# Basic algorithms -def LRU(cache_size: int) -> Cache: ... -def LFU(cache_size: int) -> Cache: ... -def FIFO(cache_size: int) -> Cache: ... -def Clock(cache_size: int) -> Cache: ... 
-def Random(cache_size: int) -> Cache: ... - -# Advanced algorithms -def ARC(cache_size: int) -> Cache: ... -def S3FIFO(cache_size: int) -> Cache: ... -def Sieve(cache_size: int) -> Cache: ... -def TinyLFU(cache_size: int) -> Cache: ... -def TwoQ(cache_size: int) -> Cache: ... -```ence - -This page provides detailed API documentation for libCacheSim Python bindings. - -## Core Classes - -### Cache Classes - -All cache classes inherit from the base cache interface and provide the following methods: - -::: libcachesim.cache - -### TraceReader - -```python -class TraceReader: - """Read trace files in various formats.""" - - def __init__(self, trace_path: str, trace_type: TraceType, - reader_params: ReaderInitParam = None): - """Initialize trace reader. - - Args: - trace_path: Path to trace file - trace_type: Type of trace format - reader_params: Optional reader configuration - """ - - def __iter__(self): - """Iterate over requests in the trace.""" - - def reset(self): - """Reset reader to beginning of trace.""" - - def skip(self, n: int): - """Skip n requests.""" - - def clone(self): - """Create a copy of the reader.""" -``` - -### SyntheticReader - -```python -class SyntheticReader: - """Generate synthetic workloads.""" - - def __init__(self, num_objects: int, num_requests: int, - distribution: str = "zipf", alpha: float = 1.0, - obj_size: int = 1, seed: int = None): - """Initialize synthetic reader. 
- - Args: - num_objects: Number of unique objects - num_requests: Total requests to generate - distribution: Distribution type ("zipf", "uniform") - alpha: Zipf skewness parameter - obj_size: Object size in bytes - seed: Random seed for reproducibility - """ -``` - -### TraceAnalyzer - -```python -class TraceAnalyzer: - """Analyze trace characteristics.""" - - def __init__(self, trace_path: str, trace_type: TraceType, - reader_params: ReaderInitParam = None): - """Initialize trace analyzer.""" - - def get_num_requests(self) -> int: - """Get total number of requests.""" - - def get_num_objects(self) -> int: - """Get number of unique objects.""" - - def get_working_set_size(self) -> int: - """Get working set size.""" -``` - -## Enumerations and Constants - -### TraceType - -```python -class TraceType: - """Supported trace file formats.""" - CSV_TRACE = "csv" - BINARY_TRACE = "binary" - ORACLE_GENERAL_TRACE = "oracle" - PLAIN_TXT_TRACE = "txt" -``` - -### SamplerType - -```python -class SamplerType: - """Sampling strategies.""" - SPATIAL_SAMPLER = "spatial" - TEMPORAL_SAMPLER = "temporal" -``` - -### ReqOp - -```python -class ReqOp: - """Request operation types.""" - READ = "read" - WRITE = "write" - DELETE = "delete" -``` - -## Data Structures - -### Request - -```python -class Request: - """Represents a cache request.""" - - def __init__(self): - self.obj_id: int = 0 - self.obj_size: int = 1 - self.timestamp: int = 0 - self.op: str = "read" -``` - -### ReaderInitParam - -```python -class ReaderInitParam: - """Configuration parameters for trace readers.""" - - def __init__(self): - self.has_header: bool = False - self.delimiter: str = "," - self.obj_id_is_num: bool = True - self.ignore_obj_size: bool = False - self.ignore_size_zero_req: bool = True - self.cap_at_n_req: int = -1 - self.block_size: int = 4096 - self.trace_start_offset: int = 0 - - # Field mappings (1-indexed) - self.time_field: int = 1 - self.obj_id_field: int = 2 - self.obj_size_field: int = 3 - 
self.op_field: int = 4 - - self.sampler: Sampler = None -``` - -### Sampler - -```python -class Sampler: - """Configuration for request sampling.""" - - def __init__(self, sample_ratio: float = 1.0, - type: str = "spatial"): - """Initialize sampler. - - Args: - sample_ratio: Fraction of requests to sample (0.0-1.0) - type: Sampling type ("spatial" or "temporal") - """ - self.sample_ratio = sample_ratio - self.type = type -``` - -## Utility Functions - -### Synthetic Trace Generation - -```python -def create_zipf_requests(num_objects, num_requests, alpha, obj_size, seed=None): - """ - Create Zipf-distributed synthetic requests. - - Args: - num_objects (int): Number of unique objects - num_requests (int): Total number of requests to generate - alpha (float): Zipf skewness parameter (higher = more skewed) - obj_size (int): Size of each object in bytes - seed (int, optional): Random seed for reproducibility - - Returns: - List[Request]: List of generated requests - """ - -def create_uniform_requests(num_objects, num_requests, obj_size, seed=None): - """ - Create uniformly-distributed synthetic requests. 
- - Args: - num_objects (int): Number of unique objects - num_requests (int): Total number of requests to generate - obj_size (int): Size of each object in bytes - seed (int, optional): Random seed for reproducibility - - Returns: - List[Request]: List of generated requests - """ -``` - -### Cache Algorithms - -Available cache algorithms with their factory functions: - -```python -# Basic algorithms -LRU(cache_size: int) -> Cache -LFU(cache_size: int) -> Cache -FIFO(cache_size: int) -> Cache -Clock(cache_size: int) -> Cache -Random(cache_size: int) -> Cache - -# Advanced algorithms -ARC(cache_size: int) -> Cache -S3FIFO(cache_size: int) -> Cache -Sieve(cache_size: int) -> Cache -TinyLFU(cache_size: int) -> Cache -TwoQ(cache_size: int) -> Cache -LRB(cache_size: int) -> Cache - -# Experimental algorithms -cache_3L(cache_size: int) -> Cache -``` - -### Performance Metrics - -```python -class CacheStats: - """Cache performance statistics.""" - - def __init__(self): - self.hits = 0 - self.misses = 0 - self.evictions = 0 - self.bytes_written = 0 - self.bytes_read = 0 - - @property - def hit_ratio(self) -> float: - """Calculate hit ratio.""" - total = self.hits + self.misses - return self.hits / total if total > 0 else 0.0 - - @property - def miss_ratio(self) -> float: - """Calculate miss ratio.""" - return 1.0 - self.hit_ratio -``` - -## Error Handling - -The library uses standard Python exceptions: - -- `ValueError`: Invalid parameters or configuration -- `FileNotFoundError`: Trace file not found -- `RuntimeError`: Runtime errors from underlying C++ library -- `MemoryError`: Out of memory conditions - -Example error handling: - -```python -try: - reader = lcs.TraceReader("nonexistent.csv", lcs.TraceType.CSV_TRACE) -except FileNotFoundError: - print("Trace file not found") -except ValueError as e: - print(f"Invalid configuration: {e}") -``` - -## Configuration Options - -### Reader Configuration - -```python -reader_params = lcs.ReaderInitParam( - has_header=True, # CSV 
has header row - delimiter=",", # Field delimiter - obj_id_is_num=True, # Object IDs are numeric - ignore_obj_size=False, # Don't ignore object sizes - ignore_size_zero_req=True, # Ignore zero-size requests - cap_at_n_req=1000000, # Limit number of requests - block_size=4096, # Block size for block-based traces - trace_start_offset=0, # Skip initial requests -) - -# Field mappings (1-indexed) -reader_params.time_field = 1 -reader_params.obj_id_field = 2 -reader_params.obj_size_field = 3 -reader_params.op_field = 4 -``` - -### Sampling Configuration - -```python -sampler = lcs.Sampler( - sample_ratio=0.1, # Sample 10% of requests - type=lcs.SamplerType.SPATIAL_SAMPLER # Spatial sampling -) -reader_params.sampler = sampler -``` - -## Thread Safety - -The library provides thread-safe operations for most use cases: - -- Cache operations are thread-safe within a single cache instance -- Multiple readers can be used concurrently -- Analysis operations can utilize multiple threads - -For high-concurrency scenarios, consider using separate cache instances per thread. - -## Memory Management - -The library automatically manages memory for most operations: - -- Cache objects handle their own memory allocation -- Trace readers manage buffering automatically -- Request objects are lightweight and reusable - -For large-scale simulations, monitor memory usage and consider: - -- Using sampling to reduce trace size -- Processing traces in chunks -- Limiting cache sizes appropriately - -## Best Practices - -1. **Use appropriate cache sizes**: Size caches based on your simulation goals -2. **Set random seeds**: For reproducible results in synthetic traces -3. **Handle errors**: Always wrap file operations in try-catch blocks -4. **Monitor memory**: For large traces, consider sampling or chunking -5. **Use threading**: Leverage multi-threading for analysis tasks -6. 
**Validate traces**: Check trace format and content before simulation +[TBD] \ No newline at end of file diff --git a/docs/src/en/developer.md b/docs/src/en/developer.md new file mode 100644 index 0000000..8fcc019 --- /dev/null +++ b/docs/src/en/developer.md @@ -0,0 +1,3 @@ +# Developer Guide + +[TBD] \ No newline at end of file diff --git a/docs/src/en/examples.md b/docs/src/en/examples.md deleted file mode 100644 index 0d56aa9..0000000 --- a/docs/src/en/examples.md +++ /dev/null @@ -1,501 +0,0 @@ -# Examples - -This page provides practical examples of using libCacheSim Python bindings for various cache simulation scenarios. - -## Basic Cache Simulation - -### Simple LRU Cache Example - -```python -import libcachesim as lcs - -# Create an LRU cache with 1MB capacity -cache = lcs.LRU(cache_size=1024*1024) - -# Generate synthetic Zipf trace -reader = lcs.SyntheticReader( - num_of_req=10000, - obj_size=1024, - dist="zipf", - alpha=1.0, - num_objects=1000, - seed=42 -) - -# Simulate cache behavior -hits = 0 -total = 0 - -for req in reader: - if cache.get(req): - hits += 1 - total += 1 - -print(f"Hit ratio: {hits/total:.4f}") -print(f"Total requests: {total}") -``` - -### Comparing Multiple Cache Algorithms - -```python -import libcachesim as lcs - -def compare_algorithms(trace_file, cache_size): - """Compare hit ratios of different cache algorithms.""" - - algorithms = { - "LRU": lcs.LRU, - "LFU": lcs.LFU, - "FIFO": lcs.FIFO, - "Clock": lcs.Clock, - "ARC": lcs.ARC, - "S3FIFO": lcs.S3FIFO - } - - results = {} - - for name, cache_class in algorithms.items(): - # Create fresh reader for each algorithm - reader = lcs.SyntheticReader( - num_of_req=10000, - obj_size=1024, - dist="zipf", - alpha=1.0, - seed=42 # Same seed for fair comparison - ) - - cache = cache_class(cache_size=cache_size) - hits = 0 - - for req in reader: - if cache.get(req): - hits += 1 - - hit_ratio = hits / reader.get_num_of_req() - results[name] = hit_ratio - print(f"{name:8}: {hit_ratio:.4f}") - - 
return results - -# Compare with 64KB cache -results = compare_algorithms("trace.csv", 64*1024) -``` - -## Working with Real Traces - -### Reading CSV Traces - -```python -import libcachesim as lcs - -def simulate_csv_trace(csv_file): - """Simulate cache behavior on CSV trace.""" - - # Configure CSV reader - reader_params = lcs.ReaderInitParam( - has_header=True, - delimiter=",", - obj_id_is_num=True - ) - - # Set field mappings (1-indexed) - reader_params.time_field = 1 - reader_params.obj_id_field = 2 - reader_params.obj_size_field = 3 - reader_params.op_field = 4 - - reader = lcs.TraceReader( - trace=csv_file, - trace_type=lcs.TraceType.CSV_TRACE, - reader_init_params=reader_params - ) - - print(f"Loaded trace with {reader.get_num_of_req()} requests") - - # Test different cache sizes - cache_sizes = [1024*1024*i for i in [1, 2, 4, 8, 16]] # 1MB to 16MB - - for size in cache_sizes: - cache = lcs.LRU(cache_size=size) - reader.reset() # Reset to beginning - - hits = 0 - for req in reader: - if cache.get(req): - hits += 1 - - hit_ratio = hits / reader.get_num_of_req() - print(f"Cache size: {size//1024//1024}MB, Hit ratio: {hit_ratio:.4f}") - -# Usage -simulate_csv_trace("workload.csv") -``` - -### Handling Large Traces with Sampling - -```python -import libcachesim as lcs - -def analyze_large_trace(trace_file, sample_ratio=0.1): - """Analyze large trace using sampling.""" - - # Create sampler - sampler = lcs.Sampler( - sample_ratio=sample_ratio, - type=lcs.SamplerType.SPATIAL_SAMPLER - ) - - reader_params = lcs.ReaderInitParam( - has_header=True, - delimiter=",", - obj_id_is_num=True - ) - reader_params.sampler = sampler - - reader = lcs.TraceReader( - trace=trace_file, - trace_type=lcs.TraceType.CSV_TRACE, - reader_init_params=reader_params - ) - - print(f"Sampling {sample_ratio*100}% of trace") - print(f"Sampled requests: {reader.get_num_of_req()}") - - # Run simulation on sampled trace - cache = lcs.LRU(cache_size=10*1024*1024) # 10MB - hits = 0 - - for req in 
reader: - if cache.get(req): - hits += 1 - - hit_ratio = hits / reader.get_num_of_req() - print(f"Hit ratio on sampled trace: {hit_ratio:.4f}") - -# Sample 5% of a large trace -analyze_large_trace("large_trace.csv", sample_ratio=0.05) -``` - -## Advanced Analysis - -### Comprehensive Trace Analysis - -```python -import libcachesim as lcs -import os - -def comprehensive_analysis(trace_file, output_dir="analysis_results"): - """Run comprehensive trace analysis.""" - - # Create output directory - os.makedirs(output_dir, exist_ok=True) - - # Load trace - reader = lcs.TraceReader(trace_file, lcs.TraceType.CSV_TRACE) - - # Run trace analysis - analyzer = lcs.TraceAnalyzer(reader, f"{output_dir}/trace_analysis") - print("Running trace analysis...") - analyzer.run() - - print(f"Analysis complete. Results saved to {output_dir}/") - print("Generated files:") - for file in os.listdir(output_dir): - print(f" - {file}") - -# Run analysis -comprehensive_analysis("workload.csv") -``` - -### Hit Ratio Curves - -```python -import libcachesim as lcs -import matplotlib.pyplot as plt - -def plot_hit_ratio_curve(trace_file, algorithms=None): - """Plot hit ratio curves for different algorithms.""" - - if algorithms is None: - algorithms = ["LRU", "LFU", "FIFO", "ARC"] - - # Cache sizes from 1MB to 100MB - cache_sizes = [1024*1024*i for i in range(1, 101, 5)] - - plt.figure(figsize=(10, 6)) - - for algo_name in algorithms: - hit_ratios = [] - - for cache_size in cache_sizes: - reader = lcs.SyntheticReader( - num_of_req=5000, - obj_size=1024, - dist="zipf", - alpha=1.0, - seed=42 - ) - - cache = getattr(lcs, algo_name)(cache_size=cache_size) - hits = 0 - - for req in reader: - if cache.get(req): - hits += 1 - - hit_ratio = hits / reader.get_num_of_req() - hit_ratios.append(hit_ratio) - - # Convert to MB for plotting - sizes_mb = [size // 1024 // 1024 for size in cache_sizes] - plt.plot(sizes_mb, hit_ratios, label=algo_name, marker='o') - - plt.xlabel('Cache Size (MB)') - plt.ylabel('Hit 
Ratio') - plt.title('Hit Ratio vs Cache Size') - plt.legend() - plt.grid(True, alpha=0.3) - plt.show() - -# Generate hit ratio curves -plot_hit_ratio_curve("trace.csv") -``` - -## Custom Cache Policies - -### Implementing a Custom LRU with Python Hooks - -```python -import libcachesim as lcs -from collections import OrderedDict - -def create_python_lru(cache_size): - """Create a custom LRU cache using Python hooks.""" - - def init_hook(size): - """Initialize cache data structure.""" - return { - 'data': OrderedDict(), - 'size': 0, - 'capacity': size - } - - def hit_hook(cache_dict, obj_id, obj_size): - """Handle cache hit.""" - # Move to end (most recently used) - cache_dict['data'].move_to_end(obj_id) - - def miss_hook(cache_dict, obj_id, obj_size): - """Handle cache miss.""" - # Add new item - cache_dict['data'][obj_id] = obj_size - cache_dict['size'] += obj_size - - def eviction_hook(cache_dict, obj_id, obj_size): - """Handle eviction when cache is full.""" - # Remove least recently used items - while cache_dict['size'] + obj_size > cache_dict['capacity']: - if not cache_dict['data']: - break - lru_id, lru_size = cache_dict['data'].popitem(last=False) - cache_dict['size'] -= lru_size - - return lcs.PythonHookCache( - cache_size=cache_size, - init_hook=init_hook, - hit_hook=hit_hook, - miss_hook=miss_hook, - eviction_hook=eviction_hook - ) - -# Test custom LRU -custom_cache = create_python_lru(1024*1024) -reader = lcs.SyntheticReader(num_of_req=1000, obj_size=1024) - -hits = 0 -for req in reader: - if custom_cache.get(req): - hits += 1 - -print(f"Custom LRU hit ratio: {hits/1000:.4f}") -``` - -### Time-based Cache with TTL - -```python -import libcachesim as lcs -import time - -def create_ttl_cache(cache_size, ttl_seconds=300): - """Create a cache with time-to-live (TTL) expiration.""" - - def init_hook(size): - return { - 'data': {}, - 'timestamps': {}, - 'size': 0, - 'capacity': size, - 'ttl': ttl_seconds - } - - def is_expired(cache_dict, obj_id): - """Check 
if object has expired.""" - if obj_id not in cache_dict['timestamps']: - return True - return time.time() - cache_dict['timestamps'][obj_id] > cache_dict['ttl'] - - def hit_hook(cache_dict, obj_id, obj_size): - """Handle cache hit.""" - if is_expired(cache_dict, obj_id): - # Expired, treat as miss - if obj_id in cache_dict['data']: - del cache_dict['data'][obj_id] - del cache_dict['timestamps'][obj_id] - cache_dict['size'] -= obj_size - return False - return True - - def miss_hook(cache_dict, obj_id, obj_size): - """Handle cache miss.""" - current_time = time.time() - cache_dict['data'][obj_id] = obj_size - cache_dict['timestamps'][obj_id] = current_time - cache_dict['size'] += obj_size - - def eviction_hook(cache_dict, obj_id, obj_size): - """Handle eviction.""" - # First try to evict expired items - current_time = time.time() - expired_items = [] - - for oid, timestamp in cache_dict['timestamps'].items(): - if current_time - timestamp > cache_dict['ttl']: - expired_items.append(oid) - - for oid in expired_items: - if oid in cache_dict['data']: - cache_dict['size'] -= cache_dict['data'][oid] - del cache_dict['data'][oid] - del cache_dict['timestamps'][oid] - - # If still need space, evict oldest items - while cache_dict['size'] + obj_size > cache_dict['capacity']: - if not cache_dict['data']: - break - # Find oldest item - oldest_id = min(cache_dict['timestamps'].keys(), - key=lambda x: cache_dict['timestamps'][x]) - cache_dict['size'] -= cache_dict['data'][oldest_id] - del cache_dict['data'][oldest_id] - del cache_dict['timestamps'][oldest_id] - - return lcs.PythonHookCache( - cache_size=cache_size, - init_hook=init_hook, - hit_hook=hit_hook, - miss_hook=miss_hook, - eviction_hook=eviction_hook - ) - -# Test TTL cache -ttl_cache = create_ttl_cache(1024*1024, ttl_seconds=60) -``` - -## Performance Optimization - -### Batch Processing for Large Workloads - -```python -import libcachesim as lcs - -def batch_simulation(trace_file, batch_size=10000): - """Process 
large traces in batches to optimize memory usage.""" - - reader = lcs.TraceReader(trace_file, lcs.TraceType.CSV_TRACE) - cache = lcs.LRU(cache_size=10*1024*1024) - - total_requests = 0 - total_hits = 0 - batch_count = 0 - - while True: - batch_hits = 0 - batch_requests = 0 - - # Process a batch of requests - for _ in range(batch_size): - try: - req = reader.read_one_req() - if req.valid: - if cache.get(req): - batch_hits += 1 - batch_requests += 1 - else: - break # End of trace - except: - break - - if batch_requests == 0: - break - - total_hits += batch_hits - total_requests += batch_requests - batch_count += 1 - - # Print progress - hit_ratio = batch_hits / batch_requests - print(f"Batch {batch_count}: {batch_requests} requests, " - f"hit ratio: {hit_ratio:.4f}") - - overall_hit_ratio = total_hits / total_requests - print(f"Overall: {total_requests} requests, hit ratio: {overall_hit_ratio:.4f}") - -# Process in batches -batch_simulation("large_trace.csv", batch_size=50000) -``` - -### Multi-threaded Analysis - -```python -import libcachesim as lcs -import concurrent.futures -import threading - -def parallel_cache_comparison(trace_file, algorithms, cache_size): - """Compare cache algorithms in parallel.""" - - def simulate_algorithm(algo_name): - """Simulate single algorithm.""" - reader = lcs.TraceReader(trace_file, lcs.TraceType.CSV_TRACE) - cache = getattr(lcs, algo_name)(cache_size=cache_size) - - hits = 0 - total = 0 - - for req in reader: - if cache.get(req): - hits += 1 - total += 1 - - hit_ratio = hits / total if total > 0 else 0 - return algo_name, hit_ratio - - # Run simulations in parallel - with concurrent.futures.ThreadPoolExecutor(max_workers=4) as executor: - futures = {executor.submit(simulate_algorithm, algo): algo - for algo in algorithms} - - results = {} - for future in concurrent.futures.as_completed(futures): - algo_name, hit_ratio = future.result() - results[algo_name] = hit_ratio - print(f"{algo_name}: {hit_ratio:.4f}") - - return results - 
-# Compare algorithms in parallel -algorithms = ["LRU", "LFU", "FIFO", "ARC", "S3FIFO"] -results = parallel_cache_comparison("trace.csv", algorithms, 1024*1024) -``` - -These examples demonstrate the versatility and power of libCacheSim Python bindings for cache simulation, analysis, and research. You can modify and extend these examples for your specific use cases. diff --git a/docs/src/en/examples/analysis.md b/docs/src/en/examples/analysis.md new file mode 100644 index 0000000..ccdcb6f --- /dev/null +++ b/docs/src/en/examples/analysis.md @@ -0,0 +1,3 @@ +# Trace Analysis + +[TBD] \ No newline at end of file diff --git a/docs/src/en/plugin.md b/docs/src/en/examples/plugins.md similarity index 100% rename from docs/src/en/plugin.md rename to docs/src/en/examples/plugins.md diff --git a/docs/src/en/examples/simulation.md b/docs/src/en/examples/simulation.md new file mode 100644 index 0000000..03d5e76 --- /dev/null +++ b/docs/src/en/examples/simulation.md @@ -0,0 +1,3 @@ +# Cache Simulation + +[TBD] \ No newline at end of file diff --git a/docs/src/en/faq.md b/docs/src/en/faq.md new file mode 100644 index 0000000..dd82326 --- /dev/null +++ b/docs/src/en/faq.md @@ -0,0 +1,5 @@ +# Frequently Asked Questions + +1. How to resolve when pip install fails? + +See [installation](https://cachemon.github.io/libCacheSim-python/getting_started/installation/). \ No newline at end of file diff --git a/docs/src/en/getting_started/installation.md b/docs/src/en/getting_started/installation.md new file mode 100644 index 0000000..7e0f4ef --- /dev/null +++ b/docs/src/en/getting_started/installation.md @@ -0,0 +1,3 @@ +# Installation + +[TBD] \ No newline at end of file diff --git a/docs/src/en/getting_started/quickstart.md b/docs/src/en/getting_started/quickstart.md new file mode 100644 index 0000000..b913a9d --- /dev/null +++ b/docs/src/en/getting_started/quickstart.md @@ -0,0 +1,205 @@ +# Quickstart + +This guide will help you get started with libCacheSim. 
+ +## Prerequisites + +- OS: Linux / macOS +- Python: 3.9 -- 3.13 + +## Installation + +You can install libCacheSim using [pip](https://pypi.org/project/libcachesim/) directly. + +It's recommended to use [uv](https://docs.astral.sh/uv/), a very fast Python environment manager, to create and manage Python environments. Please follow the [documentation](https://docs.astral.sh/uv/#getting-started) to install `uv`. After installing `uv`, you can create a new Python environment and install libCacheSim using the following commands: + +```bash +uv venv --python 3.12 --seed +source .venv/bin/activate +uv pip install libcachesim +``` + +For users who want to run LRB, ThreeLCache, and GLCache eviction algorithms: + +!!! important + If `uv` cannot find built wheels for your machine, the build system will skip these algorithms by default. + +To enable them, you need to install all third-party dependencies first. + +!!! note + To install all dependencies, you can use the provided scripts. + ```bash + git clone https://github.com/cacheMon/libCacheSim-python.git + cd libCacheSim-python + bash scripts/install_deps.sh + + # If you cannot install software directly (e.g., no sudo access) + bash scripts/install_deps_user.sh + ``` + +Then, you can reinstall libcachesim using the following commands: + +```bash +# Enable LRB +CMAKE_ARGS="-DENABLE_LRB=ON" uv pip install libcachesim +# Enable ThreeLCache +CMAKE_ARGS="-DENABLE_3L_CACHE=ON" uv pip install libcachesim +# Enable GLCache +CMAKE_ARGS="-DENABLE_GLCACHE=ON" uv pip install libcachesim +``` + +## Cache Simulation + +With libcachesim installed, you can run cache simulations with various eviction algorithms and cache traces. See the example script: + +??? 
code + ```python + import libcachesim as lcs + + # Step 1: Get one trace from S3 bucket + URI = "cache_dataset_oracleGeneral/2007_msr/msr_hm_0.oracleGeneral.zst" + dl = lcs.DataLoader() + dl.load(URI) + + # Step 2: Open trace and process efficiently + reader = lcs.TraceReader( + trace = dl.get_cache_path(URI), + trace_type = lcs.TraceType.ORACLE_GENERAL_TRACE, + reader_init_params = lcs.ReaderInitParam(ignore_obj_size=False) + ) + + # Step 3: Initialize cache + cache = lcs.S3FIFO(cache_size=1024*1024) + + # Step 4: Process entire trace efficiently (C++ backend) + obj_miss_ratio, byte_miss_ratio = cache.process_trace(reader) + print(f"Object miss ratio: {obj_miss_ratio:.4f}, Byte miss ratio: {byte_miss_ratio:.4f}") + + # Step 4.1: Process with limited number of requests + cache = lcs.S3FIFO(cache_size=1024*1024) + obj_miss_ratio, byte_miss_ratio = cache.process_trace( + reader, + start_req=0, + max_req=1000 + ) + print(f"Object miss ratio: {obj_miss_ratio:.4f}, Byte miss ratio: {byte_miss_ratio:.4f}") + ``` + +The above example demonstrates the basic workflow of using `libcachesim` for cache simulation: + +1. Use `DataLoader` to download a cache trace file from an S3 bucket. +2. Open and efficiently process the trace file with `TraceReader`. +3. Initialize a cache object (here, `S3FIFO`) with a specified cache size (e.g., 1MB). +4. Run the simulation on the entire trace using `process_trace` to obtain object and byte miss ratios. +5. Optionally, process only a portion of the trace by specifying `start_req` and `max_req` for partial simulation. + +This workflow applies to most cache algorithms and trace types, making it easy to get started and customize your experiments. + +## Trace Analysis + +Here is an example demonstrating how to use `TraceAnalyzer`. + +??? 
code + ```python + import libcachesim as lcs + + # Step 1: Get one trace from S3 bucket + URI = "cache_dataset_oracleGeneral/2007_msr/msr_hm_0.oracleGeneral.zst" + dl = lcs.DataLoader() + dl.load(URI) + + reader = lcs.TraceReader( + trace = dl.get_cache_path(URI), + trace_type = lcs.TraceType.ORACLE_GENERAL_TRACE, + reader_init_params = lcs.ReaderInitParam(ignore_obj_size=False) + ) + + analysis_option = lcs.AnalysisOption( + req_rate=True, # Keep basic request rate analysis + access_pattern=False, # Disable access pattern analysis + size=True, # Keep size analysis + reuse=False, # Disable reuse analysis for small datasets + popularity=False, # Disable popularity analysis for small datasets (< 200 objects) + ttl=False, # Disable TTL analysis + popularity_decay=False, # Disable popularity decay analysis + lifetime=False, # Disable lifetime analysis + create_future_reuse_ccdf=False, # Disable experimental features + prob_at_age=False, # Disable experimental features + size_change=False, # Disable size change analysis + ) + + analysis_param = lcs.AnalysisParam() + + analyzer = lcs.TraceAnalyzer( + reader, "example_analysis", analysis_option=analysis_option, analysis_param=analysis_param + ) + + analyzer.run() + ``` + +The above code demonstrates how to perform trace analysis using `libcachesim`. The workflow is as follows: + +1. Download a trace file from an S3 bucket using `DataLoader`. +2. Open the trace file with `TraceReader`, specifying the trace type and any reader initialization parameters. +3. Configure the analysis options with `AnalysisOption` to enable or disable specific analyses (such as request rate, size, etc.). +4. Optionally, set additional analysis parameters with `AnalysisParam`. +5. Create a `TraceAnalyzer` object with the reader, output directory, and the chosen options and parameters. +6. Run the analysis with `analyzer.run()`. 
+ +After running, you can access the analysis results, such as summary statistics (`stat`) or detailed results (e.g., `example_analysis.size`). + +## Plugin System + +libCacheSim also allows user to develop their own cache eviction algorithms and test them via the plugin system. + +Here is an example of implement `LRU` via the plugin system. + +??? code + ```python + from collections import OrderedDict + from typing import Any + + from libcachesim import PluginCache, LRU, CommonCacheParams, Request + + def init_hook(_: CommonCacheParams) -> Any: + return OrderedDict() + + def hit_hook(data: Any, req: Request) -> None: + data.move_to_end(req.obj_id, last=True) + + def miss_hook(data: Any, req: Request) -> None: + data.__setitem__(req.obj_id, req.obj_size) + + def eviction_hook(data: Any, _: Request) -> int: + return data.popitem(last=False)[0] + + def remove_hook(data: Any, obj_id: int) -> None: + data.pop(obj_id, None) + + def free_hook(data: Any) -> None: + data.clear() + + + plugin_lru_cache = PluginCache( + cache_size=128, + cache_init_hook=init_hook, + cache_hit_hook=hit_hook, + cache_miss_hook=miss_hook, + cache_eviction_hook=eviction_hook, + cache_remove_hook=remove_hook, + cache_free_hook=free_hook, + cache_name="Plugin_LRU", + ) + + reader = lcs.SyntheticReader(num_objects=1000, num_of_req=10000, obj_size=1) + req_miss_ratio, byte_miss_ratio = plugin_lru_cache.process_trace(reader) + ref_req_miss_ratio, ref_byte_miss_ratio = LRU(128).process_trace(reader) + print(f"plugin req miss ratio {req_miss_ratio}, ref req miss ratio {ref_req_miss_ratio}") + print(f"plugin byte miss ratio {byte_miss_ratio}, ref byte miss ratio {ref_byte_miss_ratio}") + ``` + +By defining custom hook functions for cache initialization, hit, miss, eviction, removal, and cleanup, users can easily prototype and test their own cache eviction algorithms. 
+ + + + diff --git a/docs/src/en/index.md b/docs/src/en/index.md index 2eba51f..fbf84ae 100644 --- a/docs/src/en/index.md +++ b/docs/src/en/index.md @@ -1,68 +1,35 @@ -# libCacheSim Python Bindings +# Welcome to libCacheSim Python -Welcome to libCacheSim Python bindings! This is a high-performance cache simulation library with Python interface. +!!! note + For convenience, we refer to the *libCacheSim Python Package* (this repo) as *libCacheSim* and the *C library* as *libCacheSim lib* in the following documentation. -## Overview +
+ ![](../assets/logos/logo.jpg){ align="center" alt="libCacheSim Light" class="logo-light" width="60%" } +
-libCacheSim is a high-performance cache simulation framework that supports various cache algorithms and trace formats. The Python bindings provide an easy-to-use interface for cache simulation, analysis, and research. +

+A high-performance library for building and running cache simulations + +

-## Key Features +

+ +Star +Watch +Fork +

-- **High Performance**: Built on top of the optimized C++ libCacheSim library -- **Multiple Cache Algorithms**: Support for LRU, LFU, FIFO, ARC, Clock, S3FIFO, Sieve, and many more -- **Trace Support**: Read various trace formats (CSV, binary, OracleGeneral, etc.) -- **Synthetic Traces**: Generate synthetic workloads with Zipf and uniform distributions -- **Analysis Tools**: Built-in trace analysis and cache performance evaluation -- **Easy Integration**: Simple Python API for research and production use +libCacheSim is an easy-to-use python binding of [libCachesim lib](https://github.com/1a1a11a/libCacheSim) for building and running cache simulations. -## Quick Example +libCacheSim is fast with the features from [underlying libCacheSim lib](https://github.com/1a1a11a/libCacheSim): -```python -import libcachesim as lcs +- High performance - over 20M requests/sec for a realistic trace replay. +- High memory efficiency - predictable and small memory footprint. +- Parallelism out-of-the-box - uses the many CPU cores to speed up trace analysis and cache simulations. -# Create a cache -cache = lcs.LRU(cache_size=1024*1024) # 1MB cache +libCacheSim is flexible and easy to use with: -# Generate synthetic trace -reader = lcs.SyntheticReader( - num_of_req=10000, - obj_size=1024, - dist="zipf", - alpha=1.0 -) - -# Simulate cache behavior -hit_count = 0 -for req in reader: - if cache.get(req): - hit_count += 1 - -hit_ratio = hit_count / reader.get_num_of_req() -print(f"Hit ratio: {hit_ratio:.4f}") -``` - -## Installation - -```bash -pip install libcachesim -``` - -Or install from source: - -```bash -git clone https://github.com/cacheMon/libCacheSim-python.git -cd libCacheSim-python -pip install -e . -``` - -## Getting Started - -Check out our [Quick Start Guide](quickstart.md) to begin using libCacheSim Python bindings, or explore the [API Reference](api.md) for detailed documentation. - -## Contributing - -We welcome contributions! 
Please see our [GitHub repository](https://github.com/cacheMon/libCacheSim-python) for more information. - -## License - -This project is licensed under the GPL-3.0 License. +- Seamless integration with [open-source cache dataset](https://github.com/cacheMon/cache_dataset) consisting of thousands traces hosted on S3. +- High-throughput simulation with the [underlying libCacheSim lib](https://github.com/1a1a11a/libCacheSim) +- Detailed cache requests and other internal data control +- Customized plugin cache development without any compilation \ No newline at end of file diff --git a/docs/src/en/quickstart.md b/docs/src/en/quickstart.md deleted file mode 100644 index 2e32f4d..0000000 --- a/docs/src/en/quickstart.md +++ /dev/null @@ -1,183 +0,0 @@ -# Quick Start Guide - -This guide will help you get started with libCacheSim Python bindings. - -## Installation - -### From PyPI (Recommended) - -```bash -pip install libcachesim -``` - -### From Source - -```bash -git clone https://github.com/cacheMon/libCacheSim-python.git -cd libCacheSim-python -git submodule update --init --recursive -pip install -e . -``` - -## Basic Usage - -### 1. Creating a Cache - -```python -import libcachesim as lcs - -# Create different types of caches -lru_cache = lcs.LRU(cache_size=1024*1024) # 1MB LRU cache -lfu_cache = lcs.LFU(cache_size=1024*1024) # 1MB LFU cache -fifo_cache = lcs.FIFO(cache_size=1024*1024) # 1MB FIFO cache -``` - -### 2. Using Synthetic Traces - -```python -# Generate Zipf-distributed requests -reader = lcs.SyntheticReader( - num_of_req=10000, - obj_size=1024, - dist="zipf", - alpha=1.0, - num_objects=1000, - seed=42 -) - -# Simulate cache behavior -cache = lcs.LRU(cache_size=50*1024) -hit_count = 0 - -for req in reader: - if cache.get(req): - hit_count += 1 - -print(f"Hit ratio: {hit_count/reader.get_num_of_req():.4f}") -``` - -### 3. 
Reading Real Traces - -```python -# Read CSV trace -reader = lcs.TraceReader( - trace="path/to/trace.csv", - trace_type=lcs.TraceType.CSV_TRACE, - has_header=True, - delimiter=",", - obj_id_is_num=True -) - -# Process requests -cache = lcs.LRU(cache_size=1024*1024) -for req in reader: - result = cache.get(req) - # Process result... -``` - -### 4. Cache Performance Analysis - -```python -# Run comprehensive analysis -analyzer = lcs.TraceAnalyzer(reader, "output_prefix") -analyzer.run() - -# This generates various analysis files: -# - Hit ratio curves -# - Access pattern analysis -# - Temporal locality analysis -# - And more... -``` - -## Available Cache Algorithms - -libCacheSim supports numerous cache algorithms: - -### Basic Algorithms -- **LRU**: Least Recently Used -- **LFU**: Least Frequently Used -- **FIFO**: First In, First Out -- **Clock**: Clock algorithm -- **Random**: Random replacement - -### Advanced Algorithms -- **ARC**: Adaptive Replacement Cache -- **S3FIFO**: Simple, Fast, Fair FIFO -- **Sieve**: Sieve eviction algorithm -- **TinyLFU**: Tiny LFU with admission control -- **TwoQ**: Two-Queue algorithm -- **LRB**: Learning Relaxed Belady - -### Experimental Algorithms -- **3LCache**: Three-Level Cache -- **And many more...** - -## Trace Formats - -Supported trace formats include: - -- **CSV**: Comma-separated values -- **Binary**: Custom binary format -- **OracleGeneral**: Oracle general format -- **Vscsi**: VMware vSCSI format -- **And more...** - -## Advanced Features - -### Custom Cache Policies - -You can implement custom cache policies using Python hooks: - -```python -from collections import OrderedDict - -def create_custom_lru(): - def init_hook(cache_size): - return OrderedDict() - - def hit_hook(cache_dict, obj_id, obj_size): - cache_dict.move_to_end(obj_id) - - def miss_hook(cache_dict, obj_id, obj_size): - cache_dict[obj_id] = obj_size - - def eviction_hook(cache_dict, obj_id, obj_size): - if cache_dict: - cache_dict.popitem(last=False) - 
- return lcs.PythonHookCache( - cache_size=1024*1024, - init_hook=init_hook, - hit_hook=hit_hook, - miss_hook=miss_hook, - eviction_hook=eviction_hook - ) - -custom_cache = create_custom_lru() -``` - -### Trace Sampling - -```python -# Sample 10% of requests spatially -reader = lcs.TraceReader( - trace="large_trace.csv", - trace_type=lcs.TraceType.CSV_TRACE, - sampling_ratio=0.1, - sampling_type=lcs.SamplerType.SPATIAL_SAMPLER -) -``` - -### Multi-threaded Analysis - -```python -# Use multiple threads for analysis -analyzer = lcs.TraceAnalyzer(reader, "output", n_threads=4) -analyzer.run() -``` - -## Next Steps - -- Explore the [API Reference](api.md) for detailed documentation -- Check out [Examples](examples.md) for more complex use cases -- Visit our [GitHub repository](https://github.com/cacheMon/libCacheSim-python) for source code and issues diff --git a/examples/basic_usage.py b/examples/basic_usage.py index e8dd208..2a4bd60 100644 --- a/examples/basic_usage.py +++ b/examples/basic_usage.py @@ -7,23 +7,19 @@ # Step 2: Open trace and process efficiently reader = lcs.TraceReader( - trace = dl.get_cache_path(URI), - trace_type = lcs.TraceType.ORACLE_GENERAL_TRACE, - reader_init_params = lcs.ReaderInitParam(ignore_obj_size=False) + trace=dl.get_cache_path(URI), + trace_type=lcs.TraceType.ORACLE_GENERAL_TRACE, + reader_init_params=lcs.ReaderInitParam(ignore_obj_size=False), ) # Step 3: Initialize cache -cache = lcs.S3FIFO(cache_size=1024*1024) +cache = lcs.S3FIFO(cache_size=1024 * 1024) # Step 4: Process entire trace efficiently (C++ backend) obj_miss_ratio, byte_miss_ratio = cache.process_trace(reader) print(f"Object miss ratio: {obj_miss_ratio:.4f}, Byte miss ratio: {byte_miss_ratio:.4f}") # Step 4.1: Process with limited number of requests -cache = lcs.S3FIFO(cache_size=1024*1024) -obj_miss_ratio, byte_miss_ratio = cache.process_trace( - reader, - start_req=0, - max_req=1000 -) -print(f"Object miss ratio: {obj_miss_ratio:.4f}, Byte miss ratio: 
{byte_miss_ratio:.4f}") \ No newline at end of file +cache = lcs.S3FIFO(cache_size=1024 * 1024) +obj_miss_ratio, byte_miss_ratio = cache.process_trace(reader, start_req=0, max_req=1000) +print(f"Object miss ratio: {obj_miss_ratio:.4f}, Byte miss ratio: {byte_miss_ratio:.4f}") diff --git a/examples/plugin_cache/s3fifo.py b/examples/plugin_cache/s3fifo.py index 1207e23..aa1fcdf 100644 --- a/examples/plugin_cache/s3fifo.py +++ b/examples/plugin_cache/s3fifo.py @@ -8,13 +8,16 @@ from collections import deque from libcachesim import PluginCache, CommonCacheParams, Request, S3FIFO, FIFO, SyntheticReader + # NOTE(haocheng): we only support ignore object size for now class StandaloneS3FIFO: - def __init__(self, - small_size_ratio: float = 0.1, - ghost_size_ratio: float = 0.9, - move_to_main_threshold: int = 2, - cache_size: int = 1024): + def __init__( + self, + small_size_ratio: float = 0.1, + ghost_size_ratio: float = 0.9, + move_to_main_threshold: int = 2, + cache_size: int = 1024, + ): self.cache_size = cache_size small_fifo_size = int(small_size_ratio * cache_size) main_fifo_size = cache_size - small_fifo_size @@ -27,15 +30,15 @@ def __init__(self, self.small_fifo = FIFO(small_fifo_size) self.main_fifo = FIFO(main_fifo_size) self.ghost_fifo = FIFO(ghost_fifo_size) - + # Frequency tracking self.freq = {} - + # Other parameters self.max_freq = 3 self.move_to_main_threshold = move_to_main_threshold - self.has_evicted = False # Mark if we start to evict, only after full we will start eviction + self.has_evicted = False # Mark if we start to evict, only after full we will start eviction self.hit_on_ghost = False def cache_hit(self, req: Request): @@ -46,7 +49,7 @@ def cache_hit(self, req: Request): if self.main_fifo.find(req, update_cache=False): self.freq[req.obj_id] += 1 - + def cache_miss(self, req: Request): if not self.hit_on_ghost: obj = self.ghost_fifo.find(req, update_cache=False) @@ -56,14 +59,13 @@ def cache_miss(self, req: Request): 
self.ghost_fifo.remove(req.obj_id) self.ghost_set.remove(req.obj_id) - # NOTE(haocheng): first we need to know this miss object has record in ghost or not if not self.hit_on_ghost: if req.obj_size >= self.small_fifo.cache_size: # If object is too large, we do not process it return - # If is initialization state, we need to insert to small fifo, + # If is initialization state, we need to insert to small fifo, # then we can insert to main fifo if not self.has_evicted and self.small_fifo.get_occupied_byte() >= self.small_fifo.cache_size: obj = self.main_fifo.insert(req) @@ -76,7 +78,7 @@ def cache_miss(self, req: Request): self.main_set.add(req.obj_id) self.hit_on_ghost = False self.freq[obj.obj_id] = 0 - + def cache_evict_small(self, req: Request): has_evicted = False evicted_id = None @@ -100,7 +102,7 @@ def cache_evict_small(self, req: Request): self.small_set.remove(evicted_id) assert flag, "Should be able to remove" return real_evicted_id - + def cache_evict_main(self, req: Request): has_evicted = False evicted_id = None @@ -134,15 +136,15 @@ def cache_evict(self, req: Request): self.ghost_set.remove(req.obj_id) self.has_evicted = True - cond = (self.main_fifo.get_occupied_byte() > self.main_fifo.cache_size) - if (cond or (self.small_fifo.get_occupied_byte() == 0)): + cond = self.main_fifo.get_occupied_byte() > self.main_fifo.cache_size + if cond or (self.small_fifo.get_occupied_byte() == 0): obj_id = self.cache_evict_main(req) else: obj_id = self.cache_evict_small(req) if obj_id is not None: del self.freq[obj_id] - + return obj_id def cache_remove(self, obj_id): @@ -151,28 +153,35 @@ def cache_remove(self, obj_id): removed |= self.ghost_fifo.remove(obj_id) removed |= self.main_fifo.remove(obj_id) return removed - + + def cache_init_hook(common_cache_params: CommonCacheParams): return StandaloneS3FIFO(cache_size=common_cache_params.cache_size) + def cache_hit_hook(cache, request: Request): cache.cache_hit(request) + def cache_miss_hook(cache, request: Request): 
cache.cache_miss(request) + def cache_eviction_hook(cache, request: Request): evicted_id = None while evicted_id is None: evicted_id = cache.cache_evict(request) return evicted_id + def cache_remove_hook(cache, obj_id): cache.cache_remove(obj_id) + def cache_free_hook(cache): pass + cache = PluginCache( cache_size=1024, cache_init_hook=cache_init_hook, @@ -181,7 +190,8 @@ def cache_free_hook(cache): cache_eviction_hook=cache_eviction_hook, cache_remove_hook=cache_remove_hook, cache_free_hook=cache_free_hook, - cache_name="S3FIFO") + cache_name="S3FIFO", +) URI = "cache_dataset_oracleGeneral/2007_msr/msr_hm_0.oracleGeneral.zst" dl = lcs.DataLoader() @@ -189,9 +199,9 @@ def cache_free_hook(cache): # Step 2: Open trace and process efficiently reader = lcs.TraceReader( - trace = dl.get_cache_path(URI), - trace_type = lcs.TraceType.ORACLE_GENERAL_TRACE, - reader_init_params = lcs.ReaderInitParam(ignore_obj_size=True) + trace=dl.get_cache_path(URI), + trace_type=lcs.TraceType.ORACLE_GENERAL_TRACE, + reader_init_params=lcs.ReaderInitParam(ignore_obj_size=True), ) ref_s3fifo = S3FIFO(cache_size=1024, small_size_ratio=0.1, ghost_size_ratio=0.9, move_to_main_threshold=2) @@ -208,4 +218,4 @@ def cache_free_hook(cache): assert req_miss_ratio == ref_req_miss_ratio assert byte_miss_ratio == ref_byte_miss_ratio -print("All requests processed successfully. Plugin cache matches reference S3FIFO cache.") \ No newline at end of file +print("All requests processed successfully. 
Plugin cache matches reference S3FIFO cache.") diff --git a/examples/trace_analysis.py b/examples/trace_analysis.py new file mode 100644 index 0000000..0318171 --- /dev/null +++ b/examples/trace_analysis.py @@ -0,0 +1,32 @@ +import libcachesim as lcs + +# Step 1: Get one trace from S3 bucket +URI = "cache_dataset_oracleGeneral/2007_msr/msr_hm_0.oracleGeneral.zst" +dl = lcs.DataLoader() +dl.load(URI) + +reader = lcs.TraceReader( + trace=dl.get_cache_path(URI), + trace_type=lcs.TraceType.ORACLE_GENERAL_TRACE, + reader_init_params=lcs.ReaderInitParam(ignore_obj_size=False), +) + +analysis_option = lcs.AnalysisOption( + req_rate=True, # Keep basic request rate analysis + access_pattern=False, # Disable access pattern analysis + size=True, # Keep size analysis + reuse=False, # Disable reuse analysis for small datasets + popularity=False, # Disable popularity analysis for small datasets (< 200 objects) + ttl=False, # Disable TTL analysis + popularity_decay=False, # Disable popularity decay analysis + lifetime=False, # Disable lifetime analysis + create_future_reuse_ccdf=False, # Disable experimental features + prob_at_age=False, # Disable experimental features + size_change=False, # Disable size change analysis +) + +analysis_param = lcs.AnalysisParam() + +analyzer = lcs.TraceAnalyzer(reader, "example_analysis", analysis_option=analysis_option, analysis_param=analysis_param) + +analyzer.run() diff --git a/libcachesim/cache.py b/libcachesim/cache.py index b61a512..94087e9 100644 --- a/libcachesim/cache.py +++ b/libcachesim/cache.py @@ -284,6 +284,7 @@ def __init__( def insert(self, req: Request) -> Optional[CacheObject]: return super().insert(req) + class TwoQ(CacheBase): """2Q replacement algorithm @@ -454,18 +455,24 @@ def __init__( class LRUProb(CacheBase): """LRU with Probabilistic Replacement - + Special parameters: prob: probability of promoting an object to the head of the queue (default: 0.5) """ def __init__( - self, cache_size: int, default_ttl: int = 86400 * 
300, hashpower: int = 24, consider_obj_metadata: bool = False, + self, + cache_size: int, + default_ttl: int = 86400 * 300, + hashpower: int = 24, + consider_obj_metadata: bool = False, prob: float = 0.5, ): cache_specific_params = f"prob={prob}" super().__init__( - _cache=LRU_Prob_init(_create_common_params(cache_size, default_ttl, hashpower, consider_obj_metadata), cache_specific_params) + _cache=LRU_Prob_init( + _create_common_params(cache_size, default_ttl, hashpower, consider_obj_metadata), cache_specific_params + ) ) @@ -551,7 +558,9 @@ def __init__( try: from .libcachesim_python import ThreeLCache_init except ImportError: - raise ImportError("ThreeLCache is not installed. Please install it with `pip install libcachesim[all]`") + raise ImportError( + 'ThreeLCache is not installed. Please install it with `CMAKE_ARGS="-DENABLE_3L_CACHE=ON" pip install libcachesim --force-reinstall`' + ) cache_specific_params = f"objective={objective}" super().__init__( @@ -592,7 +601,9 @@ def __init__( try: from .libcachesim_python import GLCache_init except ImportError: - raise ImportError("GLCache is not installed. Please install it with `pip install libcachesim[all]`") + raise ImportError( + 'GLCache is not installed. Please install it with `CMAKE_ARGS="-DENABLE_GLCACHE=ON" pip install libcachesim --force-reinstall`' + ) cache_specific_params = f"segment-size={segment_size}, n-merge={n_merge}, type={type}, rank-intvl={rank_intvl}, merge-consecutive-segs={merge_consecutive_segs}, train-source-y={train_source_y}, retrain-intvl={retrain_intvl}" super().__init__( @@ -621,7 +632,9 @@ def __init__( try: from .libcachesim_python import LRB_init except ImportError: - raise ImportError("LRB is not installed. Please install it with `pip install libcachesim[all]`") + raise ImportError( + 'LRB is not installed. 
Please install it with `CMAKE_ARGS="-DENABLE_LRB=ON" pip install libcachesim --force-reinstall`' + ) cache_specific_params = f"objective={objective}" super().__init__( diff --git a/libcachesim/synthetic_reader.py b/libcachesim/synthetic_reader.py index b429242..936f29d 100644 --- a/libcachesim/synthetic_reader.py +++ b/libcachesim/synthetic_reader.py @@ -90,7 +90,7 @@ def read_one_req(self) -> Request: req = Request() if self.current_pos >= self.num_of_req: req.valid = False - return req # return invalid request + return req # return invalid request obj_id = self.obj_ids[self.current_pos] req.obj_id = obj_id diff --git a/libcachesim/trace_reader.py b/libcachesim/trace_reader.py index 20a2aba..d282a68 100644 --- a/libcachesim/trace_reader.py +++ b/libcachesim/trace_reader.py @@ -169,7 +169,7 @@ def get_num_of_req(self) -> int: def read_one_req(self) -> Request: req = Request() - ret = self._reader.read_one_req(req) # return 0 if success + ret = self._reader.read_one_req(req) # return 0 if success if ret != 0: raise RuntimeError("Failed to read one request") return req diff --git a/pyproject.toml b/pyproject.toml index 3618995..d71659c 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -9,7 +9,7 @@ build-backend = "scikit_build_core.build" [project] name = "libcachesim" -version = "0.3.3" +version = "0.3.3.post2" description="Python bindings for libCacheSim" readme = "README.md" requires-python = ">=3.9" @@ -30,29 +30,14 @@ dependencies = [ "pytest>=8.4.1", ] + [project.optional-dependencies] test = ["pytest"] -dev = [ - "pytest", - "pre-commit", - "ruff>=0.7.0", - "mypy>=1.0.0", -] -all = [ - "xgboost", - "lightgbm" -] +dev = ["pytest", "pre-commit", "ruff>=0.7.0", "mypy>=1.0.0"] - -[tool.scikit-build] -wheel.expand-macos-universal-tags = true -build-dir = "build" -cmake.build-type = "Release" -cmake.args = ["-G", "Ninja"] -cmake.define = { CMAKE_OSX_DEPLOYMENT_TARGET = "14.0" } -cmake.version = ">=3.15" -cmake.source-dir = "." 
-install.strip = false +# ============================================================ +# pytest +# ============================================================ [tool.pytest.ini_options] minversion = "8.0" @@ -71,6 +56,23 @@ python_files = ["test.py", "test_*.py", "*_test.py"] python_classes = ["Test*"] python_functions = ["test_*"] +# ============================================================ +# scikit-build +# ============================================================ + +[tool.scikit-build] +build-dir = "build" + +[tool.scikit-build.cmake] +build-type = "Release" +args = ["-G", "Ninja"] +define = { CMAKE_OSX_DEPLOYMENT_TARGET = "14.0" } +version = ">=3.15" +source-dir = "." + +[tool.scikit-build.install] +strip = false + [tool.cibuildwheel] manylinux-x86_64-image = "quay.io/pypa/manylinux_2_34_x86_64" @@ -80,10 +82,11 @@ build = ["cp39-*", "cp310-*", "cp311-*", "cp312-*", "cp313-*"] skip = ["*-win32", "*-manylinux_i686", "*-musllinux*", "pp*"] # Set the environment variable for the wheel build step. -environment = { LCS_BUILD_DIR = "{project}/src/libCacheSim/build", MACOSX_DEPLOYMENT_TARGET = "14.0" } +# NOTE(haocheng): we enable all the optional features for the wheel build. 
+environment = { LCS_BUILD_DIR = "{project}/src/libCacheSim/build", MACOSX_DEPLOYMENT_TARGET = "14.0", CMAKE_ARGS = "-DENABLE_3L_CACHE=ON -DENABLE_GLCACHE=ON -DENABLE_LRB=ON" } # Test that the wheel can be imported -test-command = "python -c 'import libcachesim; print(\"Import successful\")'" +test-command = "python -c 'import libcachesim; print(\"Import successful\")'; cp -r {project}/tests .; python -m pytest tests/ -v -m 'not optional'; python -m pytest tests/ -v -m 'optional'" [tool.cibuildwheel.linux] before-all = "yum install -y yum-utils && yum-config-manager --set-enabled crb && yum install -y git && git submodule update --init --recursive && bash scripts/install_deps.sh" diff --git a/scripts/detect_deps.py b/scripts/detect_deps.py index ab66642..5ef26a7 100644 --- a/scripts/detect_deps.py +++ b/scripts/detect_deps.py @@ -9,11 +9,13 @@ import sys import subprocess + def fix_pybind11(): """Fix pybind11 installation""" print("Checking pybind11 installation...") try: import pybind11 + print("✓ pybind11 is installed") # Check CMake config try: @@ -29,6 +31,7 @@ def fix_pybind11(): subprocess.run([sys.executable, "-m", "pip", "install", "--force-reinstall", "pybind11"], check=True) print("✓ pybind11 reinstalled successfully") import pybind11 + cmake_dir = pybind11.get_cmake_dir() print(f"✓ pybind11 CMake directory: {cmake_dir}") return True @@ -36,25 +39,28 @@ def fix_pybind11(): print(f"✗ pybind11 installation failed: {e}") return False + def fix_xgboost(): """Fix xgboost installation""" print("Checking xgboost installation...") try: import xgboost + print("✓ xgboost is installed") # Try to find CMake directory (if available) - cmake_dir = getattr(xgboost, 'cmake_dir', None) + cmake_dir = getattr(xgboost, "cmake_dir", None) if cmake_dir: print(f"✓ xgboost CMake directory: {cmake_dir}") else: # Try common install locations import os + possible_dirs = [ - os.path.join(xgboost.__path__[0], 'cmake'), - os.path.join(xgboost.__path__[0], '..', 'cmake'), - 
'/usr/local/lib/cmake/xgboost', - '/usr/local/share/cmake/xgboost', - '/opt/homebrew/lib/cmake/xgboost', + os.path.join(xgboost.__path__[0], "cmake"), + os.path.join(xgboost.__path__[0], "..", "cmake"), + "/usr/local/lib/cmake/xgboost", + "/usr/local/share/cmake/xgboost", + "/opt/homebrew/lib/cmake/xgboost", ] found = False for d in possible_dirs: @@ -72,19 +78,21 @@ def fix_xgboost(): subprocess.run([sys.executable, "-m", "pip", "install", "--force-reinstall", "xgboost"], check=True) print("✓ xgboost reinstalled successfully") import xgboost + print("✓ xgboost is installed after reinstall") # Repeat CMake dir check after reinstall - cmake_dir = getattr(xgboost, 'cmake_dir', None) + cmake_dir = getattr(xgboost, "cmake_dir", None) if cmake_dir: print(f"✓ xgboost CMake directory: {cmake_dir}") else: import os + possible_dirs = [ - os.path.join(xgboost.__path__[0], 'cmake'), - os.path.join(xgboost.__path__[0], '..', 'cmake'), - '/usr/local/lib/cmake/xgboost', - '/usr/local/share/cmake/xgboost', - '/opt/homebrew/lib/cmake/xgboost', + os.path.join(xgboost.__path__[0], "cmake"), + os.path.join(xgboost.__path__[0], "..", "cmake"), + "/usr/local/lib/cmake/xgboost", + "/usr/local/share/cmake/xgboost", + "/opt/homebrew/lib/cmake/xgboost", ] found = False for d in possible_dirs: @@ -99,24 +107,27 @@ def fix_xgboost(): print(f"✗ xgboost installation failed: {e}") return False + def fix_lightgbm(): """Fix lightgbm installation""" print("Checking lightgbm installation...") try: import lightgbm + print("✓ lightgbm is installed") # Try to find CMake directory (if available) - cmake_dir = getattr(lightgbm, 'cmake_dir', None) + cmake_dir = getattr(lightgbm, "cmake_dir", None) if cmake_dir: print(f"✓ lightgbm CMake directory: {cmake_dir}") else: import os + possible_dirs = [ - os.path.join(lightgbm.__path__[0], 'cmake'), - os.path.join(lightgbm.__path__[0], '..', 'cmake'), - '/usr/local/lib/cmake/LightGBM', - '/usr/local/share/cmake/LightGBM', - '/opt/homebrew/lib/cmake/LightGBM', + 
os.path.join(lightgbm.__path__[0], "cmake"), + os.path.join(lightgbm.__path__[0], "..", "cmake"), + "/usr/local/lib/cmake/LightGBM", + "/usr/local/share/cmake/LightGBM", + "/opt/homebrew/lib/cmake/LightGBM", ] found = False for d in possible_dirs: @@ -134,19 +145,21 @@ def fix_lightgbm(): subprocess.run([sys.executable, "-m", "pip", "install", "--force-reinstall", "lightgbm"], check=True) print("✓ lightgbm reinstalled successfully") import lightgbm + print("✓ lightgbm is installed after reinstall") # Repeat CMake dir check after reinstall - cmake_dir = getattr(lightgbm, 'cmake_dir', None) + cmake_dir = getattr(lightgbm, "cmake_dir", None) if cmake_dir: print(f"✓ lightgbm CMake directory: {cmake_dir}") else: import os + possible_dirs = [ - os.path.join(lightgbm.__path__[0], 'cmake'), - os.path.join(lightgbm.__path__[0], '..', 'cmake'), - '/usr/local/lib/cmake/LightGBM', - '/usr/local/share/cmake/LightGBM', - '/opt/homebrew/lib/cmake/LightGBM', + os.path.join(lightgbm.__path__[0], "cmake"), + os.path.join(lightgbm.__path__[0], "..", "cmake"), + "/usr/local/lib/cmake/LightGBM", + "/usr/local/share/cmake/LightGBM", + "/opt/homebrew/lib/cmake/LightGBM", ] found = False for d in possible_dirs: @@ -161,6 +174,7 @@ def fix_lightgbm(): print(f"✗ lightgbm installation failed: {e}") return False + def detect_dependencies(): """Detect dependencies for the project""" print("Detecting dependencies...") @@ -170,5 +184,6 @@ def detect_dependencies(): fix_xgboost() fix_lightgbm() + if __name__ == "__main__": - detect_dependencies() \ No newline at end of file + detect_dependencies() diff --git a/scripts/smart_build.py b/scripts/smart_build.py index 0efb783..871845f 100644 --- a/scripts/smart_build.py +++ b/scripts/smart_build.py @@ -9,17 +9,17 @@ import os import platform + def get_macos_deployment_target(): """Get appropriate macOS deployment target""" if sys.platform != "darwin": return None - + try: - result = subprocess.run(["sw_vers", "-productVersion"], - capture_output=True, 
text=True, check=True) + result = subprocess.run(["sw_vers", "-productVersion"], capture_output=True, text=True, check=True) macos_version = result.stdout.strip() - major_version = macos_version.split('.')[0] - + major_version = macos_version.split(".")[0] + # Set deployment target to current version deployment_target = f"{major_version}.0" print(f"Detected macOS version: {macos_version}, set deployment target: {deployment_target}") @@ -28,6 +28,7 @@ def get_macos_deployment_target(): print(f"Failed to detect macOS version, using default: {e}") return "14.0" + def check_dependency(module_name): """Check if a Python module is installed""" try: @@ -36,77 +37,81 @@ def check_dependency(module_name): except ImportError: return False + def fix_pybind11(): """Fix pybind11 installation""" print("Checking pybind11...") subprocess.run([sys.executable, "scripts/fix_pybind11.py"], check=True) + def build_with_flags(): """Build according to dependencies""" # Fix pybind11 fix_pybind11() - + # Check ML dependencies xgboost_available = check_dependency("xgboost") lightgbm_available = check_dependency("lightgbm") - + print(f"XGBoost available: {xgboost_available}") print(f"LightGBM available: {lightgbm_available}") - + # Build CMake args cmake_args = ["-G", "Ninja"] - + # Add pybind11 path try: import pybind11 + pybind11_dir = pybind11.get_cmake_dir() cmake_args.extend([f"-Dpybind11_DIR={pybind11_dir}"]) print(f"Set pybind11 path: {pybind11_dir}") except Exception as e: print(f"Warning: failed to set pybind11 path: {e}") - + # Enable GLCache if XGBoost is available if xgboost_available: cmake_args.extend(["-DENABLE_GLCACHE=ON"]) print("Enable GLCache (requires XGBoost)") - + # Enable LRB and 3LCache if LightGBM is available if lightgbm_available: cmake_args.extend(["-DENABLE_LRB=ON", "-DENABLE_3L_CACHE=ON"]) print("Enable LRB and 3LCache (requires LightGBM)") - + # Set macOS deployment target deployment_target = get_macos_deployment_target() if deployment_target: 
cmake_args.extend([f"-DCMAKE_OSX_DEPLOYMENT_TARGET={deployment_target}"]) - + # Build commands build_dir = "src/libCacheSim/build" source_dir = "." - + # Clean build directory if os.path.exists(build_dir): print("Cleaning build directory...") subprocess.run(["rm", "-rf", build_dir], check=True) - + # Run CMake configure cmake_cmd = ["cmake", "-S", source_dir, "-B", build_dir] + cmake_args print(f"Running: {' '.join(cmake_cmd)}") subprocess.run(cmake_cmd, check=True) - + # Run build build_cmd = ["cmake", "--build", build_dir] print(f"Running: {' '.join(build_cmd)}") subprocess.run(build_cmd, check=True) - + print("✓ Build completed!") + def main(): print("=== libCacheSim Smart Build ===") print(f"Platform: {platform.platform()}") print(f"Python: {sys.version}") print() - + try: build_with_flags() except subprocess.CalledProcessError as e: @@ -116,5 +121,6 @@ def main(): print(f"✗ Build exception: {e}") sys.exit(1) + if __name__ == "__main__": - main() \ No newline at end of file + main() diff --git a/tests/reference.csv b/tests/reference.csv deleted file mode 100644 index cb569d0..0000000 --- a/tests/reference.csv +++ /dev/null @@ -1,20 +0,0 @@ -FIFO,0.01,0.8368 -ARC,0.01,0.8222 -Clock,0.01,0.8328 -LRB,0.01,0.8339 -LRU,0.01,0.8339 -S3FIFO,0.01,0.8235 -Sieve,0.01,0.8231 -3LCache,0.01,0.8339 -TinyLFU,0.01,0.8262 -TwoQ,0.01,0.8276 -FIFO,0.1,0.8075 -ARC,0.1,0.7688 -Clock,0.1,0.8086 -LRB,0.1,0.8097 -LRU,0.1,0.8097 -S3FIFO,0.1,0.7542 -Sieve,0.1,0.7903 -3LCache,0.1,0.8097 -TinyLFU,0.1,0.7666 -TwoQ,0.1,0.7695 diff --git a/tests/test_cache.py b/tests/test_cache.py index 108e6fd..c339b91 100644 --- a/tests/test_cache.py +++ b/tests/test_cache.py @@ -9,40 +9,57 @@ import os from libcachesim import ( # Basic algorithms - LRU, FIFO, LFU, ARC, Clock, Random, + LRU, + FIFO, + LFU, + ARC, + Clock, + Random, # Advanced algorithms - S3FIFO, Sieve, LIRS, TwoQ, SLRU, WTinyLFU, + S3FIFO, + Sieve, + LIRS, + TwoQ, + SLRU, + WTinyLFU, # Request and other utilities - Request, ReqOp, 
SyntheticReader + Request, + ReqOp, + SyntheticReader, ) # Try to import optional algorithms that might not be available try: from libcachesim import LeCaR, LFUDA, ClockPro, Cacheus + OPTIONAL_ALGORITHMS = [LeCaR, LFUDA, ClockPro, Cacheus] except ImportError: OPTIONAL_ALGORITHMS = [] try: from libcachesim import Belady, BeladySize + OPTIMAL_ALGORITHMS = [Belady, BeladySize] except ImportError: OPTIMAL_ALGORITHMS = [] try: from libcachesim import LRUProb, FlashProb + PROBABILISTIC_ALGORITHMS = [LRUProb, FlashProb] except ImportError: PROBABILISTIC_ALGORITHMS = [] try: from libcachesim import Size, GDSF + SIZE_BASED_ALGORITHMS = [Size, GDSF] except ImportError: SIZE_BASED_ALGORITHMS = [] try: from libcachesim import Hyperbolic + HYPERBOLIC_ALGORITHMS = [Hyperbolic] except ImportError: HYPERBOLIC_ALGORITHMS = [] @@ -51,43 +68,63 @@ class TestCacheBasicFunctionality: """Test basic cache functionality across different algorithms""" - @pytest.mark.parametrize("cache_class", [ - LRU, FIFO, LFU, ARC, Clock, Random, - S3FIFO, Sieve, LIRS, TwoQ, SLRU, WTinyLFU, LeCaR, LFUDA, ClockPro, Cacheus, - LRUProb, FlashProb, Size, GDSF, Hyperbolic - ]) + @pytest.mark.parametrize( + "cache_class", + [ + LRU, + FIFO, + LFU, + ARC, + Clock, + Random, + S3FIFO, + Sieve, + LIRS, + TwoQ, + SLRU, + WTinyLFU, + LeCaR, + LFUDA, + ClockPro, + Cacheus, + LRUProb, + FlashProb, + Size, + GDSF, + Hyperbolic, + ], + ) def test_cache_initialization(self, cache_class): """Test that all cache types can be initialized with different sizes""" - cache_sizes = [1024, 1024*1024, 1024*1024*1024] # 1KB, 1MB, 1GB - + cache_sizes = [1024, 1024 * 1024, 1024 * 1024 * 1024] # 1KB, 1MB, 1GB + for size in cache_sizes: try: cache = cache_class(size) assert cache is not None - assert hasattr(cache, 'get') - assert hasattr(cache, 'insert') - assert hasattr(cache, 'find') + assert hasattr(cache, "get") + assert hasattr(cache, "insert") + assert hasattr(cache, "find") except Exception as e: pytest.skip(f"Cache 
{cache_class.__name__} failed to initialize: {e}") - @pytest.mark.parametrize("cache_class", [ - LRU, FIFO, LFU, ARC, Clock, Random, - S3FIFO, Sieve, LIRS, TwoQ, SLRU, WTinyLFU - ]) + @pytest.mark.parametrize( + "cache_class", [LRU, FIFO, LFU, ARC, Clock, Random, S3FIFO, Sieve, LIRS, TwoQ, SLRU, WTinyLFU] + ) def test_basic_get_and_insert(self, cache_class): """Test basic get and insert operations""" - cache = cache_class(1024*1024) # 1MB cache - + cache = cache_class(1024 * 1024) # 1MB cache + # Create a request req = Request() req.obj_id = 1 req.obj_size = 100 req.op = ReqOp.OP_GET - + # Initially, object should not be in cache hit = cache.get(req) assert hit == False - + # Insert the object if cache_class != LIRS: cache_obj = cache.insert(req) @@ -96,20 +133,41 @@ def test_basic_get_and_insert(self, cache_class): assert cache_obj.obj_size == 100 else: assert cache.insert(req) is None - + # Now it should be a hit hit = cache.get(req) assert hit == True - @pytest.mark.parametrize("cache_class", [ - LRU, FIFO, LFU, ARC, Clock, Random, - S3FIFO, Sieve, LIRS, TwoQ, SLRU, WTinyLFU, LeCaR, LFUDA, ClockPro, Cacheus, - LRUProb, FlashProb, Size, GDSF, Hyperbolic - ]) + @pytest.mark.parametrize( + "cache_class", + [ + LRU, + FIFO, + LFU, + ARC, + Clock, + Random, + S3FIFO, + Sieve, + LIRS, + TwoQ, + SLRU, + WTinyLFU, + LeCaR, + LFUDA, + ClockPro, + Cacheus, + LRUProb, + FlashProb, + Size, + GDSF, + Hyperbolic, + ], + ) def test_cache_eviction(self, cache_class): """Test that cache eviction works when cache is full""" - cache = cache_class(1024*1024) # 1MB cache - + cache = cache_class(1024 * 1024) # 1MB cache + if cache_class == GDSF: pytest.skip("GDSF should be used with find/get but not insert") @@ -120,9 +178,9 @@ def test_cache_eviction(self, cache_class): req.obj_size = 50 # Each object is 50 bytes req.op = ReqOp.OP_GET req.next_access_vtime = 100 + i - + cache.insert(req) - + # Try to insert one more object req = Request() req.obj_id = 999 @@ -131,59 +189,101 @@ def 
test_cache_eviction(self, cache_class): req.op = ReqOp.OP_GET cache.insert(req) - @pytest.mark.parametrize("cache_class", [ - LRU, FIFO, LFU, ARC, Clock, Random, - S3FIFO, Sieve, LIRS, TwoQ, SLRU, WTinyLFU, LeCaR, LFUDA, ClockPro, Cacheus, - LRUProb, FlashProb, Size, GDSF, Hyperbolic - ]) + @pytest.mark.parametrize( + "cache_class", + [ + LRU, + FIFO, + LFU, + ARC, + Clock, + Random, + S3FIFO, + Sieve, + LIRS, + TwoQ, + SLRU, + WTinyLFU, + LeCaR, + LFUDA, + ClockPro, + Cacheus, + LRUProb, + FlashProb, + Size, + GDSF, + Hyperbolic, + ], + ) def test_cache_find_method(self, cache_class): """Test the find method functionality""" cache = cache_class(1024) - + req = Request() req.obj_id = 1 req.obj_size = 100 req.op = ReqOp.OP_GET - + # Initially should not find the object cache_obj = cache.find(req, update_cache=False) assert cache_obj is None - + # Insert the object cache.insert(req) - + # Now should find it cache_obj = cache.find(req, update_cache=False) assert cache_obj is not None assert cache_obj.obj_id == 1 - @pytest.mark.parametrize("cache_class", [ - LRU, FIFO, LFU, ARC, Clock, Random, - S3FIFO, Sieve, LIRS, TwoQ, SLRU, WTinyLFU, LeCaR, LFUDA, ClockPro, Cacheus, - LRUProb, FlashProb, Size, GDSF, Hyperbolic - ]) + @pytest.mark.parametrize( + "cache_class", + [ + LRU, + FIFO, + LFU, + ARC, + Clock, + Random, + S3FIFO, + Sieve, + LIRS, + TwoQ, + SLRU, + WTinyLFU, + LeCaR, + LFUDA, + ClockPro, + Cacheus, + LRUProb, + FlashProb, + Size, + GDSF, + Hyperbolic, + ], + ) def test_cache_can_insert(self, cache_class): """Test can_insert method""" - cache = cache_class(1024*1024) - + cache = cache_class(1024 * 1024) + req = Request() req.obj_id = 1 req.obj_size = 100 req.op = ReqOp.OP_GET - + # Should be able to insert initially can_insert = cache.can_insert(req) assert can_insert == True - + # Insert the object cache.insert(req) - + # Try to insert a larger object that won't fit req2 = Request() req2.obj_id = 2 req2.obj_size = 150 # Too large for remaining space req2.op = 
ReqOp.OP_GET - + can_insert = cache.can_insert(req2) # Some algorithms might still return True if they can evict assert can_insert in [True, False] @@ -195,12 +295,12 @@ class TestCacheEdgeCases: def test_zero_size_cache(self): """Test cache with zero size""" cache = LRU(0) - + req = Request() req.obj_id = 1 req.obj_size = 100 req.op = ReqOp.OP_GET - + # Should not be able to insert can_insert = cache.can_insert(req) assert can_insert == False @@ -208,12 +308,12 @@ def test_zero_size_cache(self): def test_large_object(self): """Test inserting object larger than cache size""" cache = LRU(100) - + req = Request() req.obj_id = 1 req.obj_size = 200 # Larger than cache req.op = ReqOp.OP_GET - + # Should not be able to insert can_insert = cache.can_insert(req) assert can_insert == False @@ -227,12 +327,12 @@ def test_string_object_id(self): def test_zero_size_object(self): """Test with zero size object""" cache = LRU(1024) - + req = Request() req.obj_id = 1 req.obj_size = 0 req.op = ReqOp.OP_GET - + # Should work fine cache.insert(req) hit = cache.get(req) @@ -245,46 +345,33 @@ class TestCacheWithSyntheticTrace: def test_cache_with_zipf_trace(self): """Test cache performance with Zipf distribution""" # Create synthetic reader with Zipf distribution - reader = SyntheticReader( - num_of_req=1000, - obj_size=100, - alpha=1.0, - dist="zipf", - num_objects=100, - seed=42 - ) - + reader = SyntheticReader(num_of_req=1000, obj_size=100, alpha=1.0, dist="zipf", num_objects=100, seed=42) + # Test with different cache algorithms cache_algorithms = [LRU, FIFO, LFU, S3FIFO, Sieve] - + for cache_class in cache_algorithms: cache = cache_class(1024) # 1KB cache - + # Process the trace miss_ratio, _ = cache.process_trace(reader) - + # Basic sanity checks assert 0.0 <= miss_ratio <= 1.0 - + # Reset reader for next test reader.reset() def test_cache_with_uniform_trace(self): """Test cache performance with uniform distribution""" # Create synthetic reader with uniform distribution - reader 
= SyntheticReader( - num_of_req=500, - obj_size=50, - dist="uniform", - num_objects=50, - seed=123 - ) - + reader = SyntheticReader(num_of_req=500, obj_size=50, dist="uniform", num_objects=50, seed=123) + cache = LRU(512) # 512B cache - + # Process the trace miss_ratio, _ = cache.process_trace(reader) - + # Basic sanity checks assert 0.0 <= miss_ratio <= 1.0 @@ -295,18 +382,18 @@ class TestCacheStatistics: def test_cache_occupied_bytes(self): """Test get_occupied_byte method""" cache = LRU(1024) - + # Initially should be 0 occupied = cache.get_occupied_byte() assert occupied == 0 - + # Insert an object req = Request() req.obj_id = 1 req.obj_size = 100 req.op = ReqOp.OP_GET cache.insert(req) - + # Should reflect the inserted object size occupied = cache.get_occupied_byte() assert occupied >= 100 # May include metadata overhead @@ -314,11 +401,11 @@ def test_cache_occupied_bytes(self): def test_cache_object_count(self): """Test get_n_obj method""" cache = LRU(1024) - + # Initially should be 0 n_obj = cache.get_n_obj() assert n_obj == 0 - + # Insert objects for i in range(3): req = Request() @@ -326,7 +413,7 @@ def test_cache_object_count(self): req.obj_size = 100 req.op = ReqOp.OP_GET cache.insert(req) - + # Should have 3 objects n_obj = cache.get_n_obj() assert n_obj == 3 @@ -334,14 +421,14 @@ def test_cache_object_count(self): def test_cache_print(self): """Test print_cache method""" cache = LRU(1024) - + # Insert an object req = Request() req.obj_id = 1 req.obj_size = 100 req.op = ReqOp.OP_GET cache.insert(req) - + # Should return a string representation cache.print_cache() @@ -352,22 +439,22 @@ class TestCacheOperations: def test_cache_remove(self): """Test remove method""" cache = LRU(1024) - + # Insert an object req = Request() req.obj_id = 1 req.obj_size = 100 req.op = ReqOp.OP_GET cache.insert(req) - + # Verify it's in cache hit = cache.get(req) assert hit == True - + # Remove it removed = cache.remove(1) assert removed == True - + # Verify it's no longer in 
cache hit = cache.get(req) assert hit == False @@ -375,7 +462,7 @@ def test_cache_remove(self): def test_cache_need_eviction(self): """Test need_eviction method""" cache = LRU(200) - + # Insert objects until cache is nearly full for i in range(3): req = Request() @@ -383,13 +470,13 @@ def test_cache_need_eviction(self): req.obj_size = 50 req.op = ReqOp.OP_GET cache.insert(req) - + # Try to insert a larger object req = Request() req.obj_id = 999 req.obj_size = 100 req.op = ReqOp.OP_GET - + # Should need eviction need_eviction = cache.need_eviction(req) assert need_eviction == True @@ -397,7 +484,7 @@ def test_cache_need_eviction(self): def test_cache_to_evict(self): """Test to_evict method""" cache = LRU(200) - + # Insert objects for i in range(3): req = Request() @@ -405,14 +492,42 @@ def test_cache_to_evict(self): req.obj_size = 50 req.op = ReqOp.OP_GET cache.insert(req) - + # Try to insert a larger object req = Request() req.obj_id = 999 req.obj_size = 100 req.op = ReqOp.OP_GET - + # Should return an object to evict evict_obj = cache.to_evict(req) assert evict_obj is not None - assert hasattr(evict_obj, 'obj_id') \ No newline at end of file + assert hasattr(evict_obj, "obj_id") + + +class TestCacheOptionalAlgorithms: + """Test optional algorithms""" + + @pytest.mark.optional + def test_glcache(self): + """Test GLCache algorithm""" + from libcachesim import GLCache + + cache = GLCache(1024) + assert cache is not None + + @pytest.mark.optional + def test_lrb(self): + """Test LRB algorithm""" + from libcachesim import LRB + + cache = LRB(1024) + assert cache is not None + + @pytest.mark.optional + def test_3lcache(self): + """Test 3LCache algorithm""" + from libcachesim import ThreeLCache + + cache = ThreeLCache(1024) + assert cache is not None