commit 18ca54544051175eb44ffd077ba761fb56d1380e Author: Stan Date: Sat Jun 29 01:01:51 2024 -0500 main push to repo diff --git a/.dockerignore b/.dockerignore new file mode 100755 index 0000000..0b1e1e7 --- /dev/null +++ b/.dockerignore @@ -0,0 +1,27 @@ +**/__pycache__ +**/.venv +**/.classpath +**/.dockerignore +**/.env +**/.git +**/.gitignore +**/.project +**/.settings +**/.toolstarget +**/.vs +**/.vscode +**/*.*proj.user +**/*.dbmdl +**/*.jfm +**/bin +**/charts +**/docker-compose* +**/compose* +**/Dockerfile* +**/node_modules +**/npm-debug.log +**/obj +**/secrets.dev.yaml +**/values.dev.yaml +LICENSE +README.md diff --git a/.vscode/launch.json b/.vscode/launch.json new file mode 100755 index 0000000..f3d8430 --- /dev/null +++ b/.vscode/launch.json @@ -0,0 +1,19 @@ +{ + "configurations": [ + { + "name": "Docker: Python - General", + "type": "docker", + "request": "launch", + "preLaunchTask": "docker-run: debug", + "python": { + "pathMappings": [ + { + "localRoot": "${workspaceFolder}", + "remoteRoot": "/app" + } + ], + "projectType": "general" + } + } + ] +} \ No newline at end of file diff --git a/.vscode/settings.json b/.vscode/settings.json new file mode 100644 index 0000000..39c0c4f --- /dev/null +++ b/.vscode/settings.json @@ -0,0 +1,3 @@ +{ + "github.gitAuthentication": false +} \ No newline at end of file diff --git a/.vscode/tasks.json b/.vscode/tasks.json new file mode 100755 index 0000000..37fc95f --- /dev/null +++ b/.vscode/tasks.json @@ -0,0 +1,26 @@ +{ + "version": "2.0.0", + "tasks": [ + { + "type": "docker-build", + "label": "docker-build", + "platform": "python", + "dockerBuild": { + "tag": "fbrowser:latest", + "dockerfile": "${workspaceFolder}/Dockerfile", + "context": "${workspaceFolder}", + "pull": true + } + }, + { + "type": "docker-run", + "label": "docker-run: debug", + "dependsOn": [ + "docker-build" + ], + "python": { + "file": "Fbrowser.py" + } + } + ] +} \ No newline at end of file diff --git a/Dockerfile b/Dockerfile new file mode 100755 
index 0000000..109f18f --- /dev/null +++ b/Dockerfile @@ -0,0 +1,23 @@ +# For more information, please refer to https://aka.ms/vscode-docker-python +FROM python:3-slim + +# Keeps Python from generating .pyc files in the container +ENV PYTHONDONTWRITEBYTECODE=1 + +# Turns off buffering for easier container logging +ENV PYTHONUNBUFFERED=1 + +# Install pip requirements +COPY requirements.txt . +RUN python -m pip install -r requirements.txt + +WORKDIR /app +COPY . /app + +# Creates a non-root user with an explicit UID and adds permission to access the /app folder +# For more info, please refer to https://aka.ms/vscode-docker-python-configure-containers +RUN adduser -u 5678 --disabled-password --gecos "" appuser && chown -R appuser /app +USER appuser + +# During debugging, this entry point will be overridden. For more information, please refer to https://aka.ms/vscode-docker-python-debug +CMD ["python", "Fbrowser.py"] diff --git a/Fbrowser.py b/Fbrowser.py new file mode 100755 index 0000000..0f87ef5 --- /dev/null +++ b/Fbrowser.py @@ -0,0 +1,273 @@ +# Path: Fbrowser.py +# Sample Music Browser & Ogranizer: Main.py + +# Importing Libraries +import sys +import os + +from ScanOrg import organizer, file_scanner, DirectoryFilterProxyModel, FileFilterProxyModel +from stanzip import Extractor as extractor +from stanzip import Compressor as compressor +from stanzip import zipfile, py7zr, rarfile +from PyQt5.QtGui import QStandardItem , QStandardItemModel, QContextMenuEvent +from PyQt5.QtWidgets import QApplication, QLabel, QPushButton, QVBoxLayout, QMenu, QTreeView, QMessageBox, QSlider, QWidget, QFileSystemModel, QSplitter, QHBoxLayout, QFileDialog +from PyQt5.QtMultimedia import QMediaPlaylist, QMediaPlayer, QMediaContent, QAudioFormat, QAudioDeviceInfo, QAudio +from PyQt5.QtCore import QDir, QSortFilterProxyModel, Qt, QUrl +from matplotlib.backends.backend_qt5agg import FigureCanvasQTAgg as MPLCanvas +from matplotlib.figure import Figure + + + +# Sample Music Browser Main 
Class +class SampleMusicBrowser(QWidget): + def __init__(self): + super().__init__() + self.organizer = organizer() + self.extractor = extractor() + self.file_model = QStandardItemModel() + self.player = QMediaPlayer() + self.playlist = QMediaPlaylist() + self.player.setPlaylist(self.playlist) + self.tree_model = QFileSystemModel() + + self.init_ui() + #self.midi_player = MidPlay() + self.folder_contents_view.setEditTriggers(QTreeView.NoEditTriggers) + self.player.error.connect(self.player_error) + self.player.mediaStatusChanged.connect(self.player_media_status_changed) + self.player.setAudioRole(QAudio.MusicRole) + self.layout = QHBoxLayout() + self.canvas = MPLCanvas() + self.layout.addWidget(self.canvas) + self.setLayout(self.layout) + + # Player Error Debugging + def player_error(self, error): + try: + if error == QMediaPlayer.NoError: + return + print(f"An error occurred: Code:{error} {self.player.errorString()}") + except Exception as e: + print(f"Error: {e}") + + # Media Status Changed Debugging + def player_media_status_changed(self, status): + if status == QMediaPlayer.NoMedia: + return + print('Media Status: ' + str(status)) + + + def on_extract_button_clicked(self): + extraction_directory = QFileDialog.getExistingDirectory(self, "Select Extraction Directory") + if extraction_directory: + index = self.folder_contents_view.currentIndex() + if index.isValid(): + self.extractor.zipviewer(index, self.file_filter_model, self.list_model, extraction_directory) + + def show_context_menu(self, position): + menu = QMenu(self) + extract_action = menu.addAction('Extract') + extract_action.triggered.connect(self.on_extract_button_clicked) # Connect to the extraction function + menu.exec(self.folder_contents_view.mapToGlobal(position)) + + def init_ui(self): + layout = QVBoxLayout() + label = QLabel('Sample Music Browser') + buttons_layout = QHBoxLayout() + layout.addWidget(label) + + + #self.midi_player = MidPlay() + self.file_tree = QTreeView() + + 
self.file_tree.setHeaderHidden(True) + self.file_tree.clicked.connect(self.change_directory) + + + play_button = QPushButton('Play') + play_button.clicked.connect(self.player.play) + #play_button.clicked.connect(self.midi_player.play_midi) + buttons_layout.addWidget(play_button) + + stop_button = QPushButton('Stop') + stop_button.clicked.connect(self.player.stop) + # stop_button.clicked.connect(self.midi_player.stop) + buttons_layout.addWidget(stop_button) + self.player.stateChanged.connect(self.player_state_changed) + self.player.positionChanged.connect(self.player_position_changed) + self.player.durationChanged.connect(self.player_duration_changed) + + + layout.addLayout(buttons_layout) + + self.folder_contents_view = QTreeView() + self.folder_contents_view.setHeaderHidden(False) + self.folder_contents_view.setRootIsDecorated(False) + self.folder_contents_view.setSortingEnabled(True) + + splitter = QSplitter() + splitter.addWidget(self.file_tree) + splitter.addWidget(self.folder_contents_view) + layout.addWidget(splitter) + self.current_dir_label = QLabel() + layout.addWidget(self.current_dir_label) + + up_dir_button = QPushButton('Up Directory') + up_dir_button.clicked.connect(self.go_up_directory) + layout.addWidget(up_dir_button) + + forward_button = QPushButton('Forward') + forward_button.clicked.connect(self.go_forward_directory) + layout.addWidget(forward_button) + self.setLayout(layout) + + self.setWindowTitle('Samples are life!') + path = QFileDialog.getExistingDirectory(self, 'Select Directory') + if path: + self.populate_file_tree(path) + + + self.player.setVolume(50) + volume_slider = QSlider(Qt.Horizontal) + volume_slider.setRange(0, 100) + volume_slider.setValue(50) + volume_slider.valueChanged.connect(self.player.setVolume) + layout.addWidget(volume_slider) + self.playlist.currentIndexChanged.connect(self.playlist_current_index_changed) + self.playlist.currentMediaChanged.connect(self.playlist_current_media_changed) + 
self.playlist.mediaInserted.connect(self.playlist_media_inserted) + self.playlist.mediaRemoved.connect(self.playlist_media_removed) + self.playlist.setPlaybackMode(QMediaPlaylist.Loop) + self.folder_contents_view.doubleClicked.connect(self.play_file) + self.folder_contents_view.setContextMenuPolicy(Qt.CustomContextMenu) + self.folder_contents_view.customContextMenuRequested.connect(self.show_context_menu) + + + def directory_loaded(self, path): + self.file_tree.setRootIndex(self.directory_model.mapFromSource(self.model.index(path))) + self.folder_contents_view.setRootIndex(self.file_filter_model.mapFromSource(self.list_model.index(path))) + + def populate_file_tree(self, path): + try: + self.tree_model.setRootPath(path) + self.file_tree.setModel(self.tree_model) + self.directory_model = DirectoryFilterProxyModel() + self.directory_model.setSourceModel(self.tree_model) + self.file_tree.setModel(self.directory_model) + self.file_tree.setRootIndex(self.directory_model.mapFromSource(self.tree_model.index(path))) + self.list_model = QFileSystemModel() + self.list_model.setRootPath(path) + self.file_filter_model = FileFilterProxyModel() + self.file_filter_model.setSourceModel(self.list_model) + self.folder_contents_view.setModel(self.file_filter_model) + self.folder_contents_view.setRootIndex(self.file_filter_model.mapFromSource(self.list_model.index(path))) + self.current_dir_label.setText(path) + except Exception as e: + print(f"Error Populating File Tree: {e}") + + def closeEvent(self, event): + reply = QMessageBox.question(self, 'Exit', 'Are you sure you want to exit?', + QMessageBox.Yes | QMessageBox.No, QMessageBox.No) + if reply == QMessageBox.Yes: + event.accept() + else: + event.ignore() + + def play_file(self, index): + try: + index = self.file_filter_model.mapToSource(index) + file_path = self.list_model.filePath(index) + if file_path.endswith(('.zip', '.rar', '.7z')): + with zipfile.ZipFile(file_path, 'r') as zip_ref: + for filename in zip_ref.namelist(): + 
if filename.lower().endswith(('mp3', 'wav', 'ogg', 'flac', + 'm4a', 'wma', 'aac', 'aiff', 'alac', + 'mid', 'midi', 'mp4', 'm4a')): + audo_file = zip_ref.extract(filename) + media = QMediaContent(QUrl.fromLocalFile(audo_file)) + self.playlist.clear() + self.playlist.addMedia(media) + self.player.play() + break + if os.path.exists(audo_file): + os.remove(audo_file) + + elif file_path.endswith(('.mid', '.midi')): + #self.midi_player = MidPlay() + #fig = self.midi_player.play_midi(file_path) + self.canvas.draw() + + else: + media = QMediaContent(QUrl.fromLocalFile(file_path)) + self.playlist.clear() + self.playlist.addMedia(media) + self.player.play() + except Exception as e: + print(f"Error Playing File: {e}") + + def player_state_changed(self, state): + if state == QMediaPlayer.StoppedState: + self.playlist.setCurrentIndex(0) + + def player_position_changed(self, position): + pass + def player_duration_changed(self, duration): + pass + def playlist_current_index_changed(self, index): + pass + def playlist_current_media_changed(self, media): + pass + def playlist_media_inserted(self, start, end): + pass + def playlist_media_removed(self, start, end): + pass + + def change_directory(self, index): + index = self.directory_model.mapToSource(index) + try: + file_path = self.tree_model.filePath(index) + self.list_model.setRootPath(file_path) + self.current_dir_label.setText(file_path) + self.folder_contents_view.setRootIndex(self.file_filter_model.mapFromSource(self.list_model.index(file_path))) + except Exception as e: + print(f"Error Changing Dirs.: {e}") + + def go_up_directory(self): + index = self.folder_contents_view.rootIndex() + index = self.file_filter_model.mapToSource(index) + parent_index = index.parent() + if parent_index.isValid(): # Check if the parent index is valid + self.folder_contents_view.setRootIndex(self.file_filter_model.mapFromSource(parent_index)) + self.current_dir_label.setText(self.list_model.filePath(parent_index)) + + def 
go_forward_directory(self): + index = self.folder_contents_view.rootIndex() + index = self.file_filter_model.mapToSource(index) + parent_index = index.parent() + if parent_index.isValid(): + self.folder_contents_view.setRootIndex(self.file_filter_model.mapFromSource(parent_index)) + self.current_dir_label.setText(self.list_model.filePath(parent_index)) + + +if __name__ == '__main__': + # player = MidPlay() + # file_path = list(player.select_file()) # Get the selected file path + + # viewer = MidViewer() + + # viewer.read_midi(file_path) + # viewer.view_midi() + #viewer.show() + # viewer.save('test.png') + # viewer.clear() + # viewer.close() + # print(viewer.get_midi_info(file_path)) # Use the file path + # print(viewer.get_piano_roll(file_path)) # Use the file path + # print(viewer.get_tempo(file_path)) # Use the file path + # print(viewer.get_notes(file_path)) # Use the file path + app = QApplication(sys.argv) + sampleMusicBrowser = SampleMusicBrowser() + sampleMusicBrowser.show() + + + sys.exit(app.exec_()) \ No newline at end of file diff --git a/MidPlay.py b/MidPlay.py new file mode 100755 index 0000000..19f8bd5 --- /dev/null +++ b/MidPlay.py @@ -0,0 +1,255 @@ +#Path: MidPlay.py +# Description: A class to play MIDI files and a class to view MIDI files +# probably switching to a different library for midi handling +# pretty_midi is not very good for this purpose or real-time playback of midi files + +"""Pretty Midi module type stubs are included but incomplete. +Pretty Midi comes with a statement to cite the following paper +when used in a research project: + +Colin Raffel and Daniel P. W. Ellis. Intuitive Analysis, +Creation and Manipulation of MIDI Data with pretty_midi. +In Proceedings of the 15th International Conference on Music +Information Retrieval Late Breaking and Demo Papers, 2014. 
+ +colinraffel.com/publications/ismir2014intuitive.pdf + +""" +import pygame +# Imports +import pretty_midi +import fluidsynth +import sys +import os +from PyQt5.QtWidgets import (QApplication, QLabel, QListWidget, QFileDialog, QMessageBox, QWidget, QPushButton, QHBoxLayout, + QVBoxLayout, + QProgressBar, + QSlider) # structured for readability and to avoid long lines and it annoys my friend XD +from PyQt5.QtCore import QTimer, Qt +import threading +import cProfile # profiler remove for production + +pygame.mixer.init() +pygame.init() + +class MidPlayGUI(QWidget): + def __init__(self): + super().__init__() + self.player = MidPlay() + self.current_midi_label = QLabel() + self.playlist_widget = QListWidget() + self.setWindowTitle("MidPlay - Midi Player") + self.init_ui() + self.timer = QTimer() + self.timer.timeout.connect(self.handle_song_end) + self.timer.start(1000) + + def set_volume(self, value): + volume = value / 100 + pygame.mixer.music.set_volume(volume) + + def update_progress(self): + if self.player.current_midi: + current_time = pygame.mixer.music.get_pos() / 1000 # get_pos returns time in milliseconds NOT SECONDS! 
+ total_time = self.player.current_midi.get_end_time() + progress = current_time / total_time * 100 + self.progress_bar.setValue(int(progress)) + + def handle_song_end(self): + if self.player.playing and not pygame.mixer.music.get_busy(): + self.player.next_song() + if self.player.playlist: + self.player.current_index %= len(self.player.playlist) + filepath = self.player.playlist[self.player.current_index] + filename = os.path.basename(filepath) + self.current_midi_label.setText(f"Current MIDI: {filename}") + self.update_progress() + + + def init_ui(self): + #label = QLabel("MidPlay - Midi player") + #label.setStyleSheet("font-size: 20px; font-weight: bold;") + + self.progress_bar = QProgressBar() + self.volume_slider = QSlider(Qt.Horizontal) + self.volume_slider.setMinimum(0) + self.volume_slider.setMaximum(100) + self.volume_slider.setValue(100) + self.volume_slider.valueChanged.connect(self.set_volume) + self.current_midi_label.setText("Current MIDI: None") + + + self.playlist_widget.itemDoubleClicked.connect(self.play_selected_song) + pygame.mixer.music.set_endevent(pygame.USEREVENT) + + # Buttons + play_button = QPushButton("Play") + play_button.clicked.connect(self.player.play_midi) + + pause_button = QPushButton("Pause") + pause_button.clicked.connect(self.player.pause) + + stop_button = QPushButton("Stop") + stop_button.clicked.connect(self.player.stop) + + next_button = QPushButton("Next") + next_button.clicked.connect(self.player.next_song) + + back_button = QPushButton("Back") + back_button.clicked.connect(self.previous_song) + + add_button = QPushButton("Add to Playlist") + add_button.clicked.connect(self.load_midi_file) + + add_folder_button = QPushButton("Add Folder to Playlist") + add_folder_button.clicked.connect(self.load_folder) + + clear_button = QPushButton("Clear Playlist") + clear_button.clicked.connect(self.clear_playlist) + + # Window layout + layout = QVBoxLayout() + layout.addWidget(self.current_midi_label) + 
layout.addWidget(self.playlist_widget) + layout.addWidget(self.progress_bar) + layout.addWidget(self.volume_slider) + layout.addWidget(play_button) + layout.addWidget(pause_button) + layout.addWidget(stop_button) + layout.addWidget(next_button) + layout.addWidget(back_button) + layout.addWidget(add_button) + layout.addWidget(add_folder_button) + layout.addWidget(clear_button) + + progress_volume_layout = QHBoxLayout() + progress_volume_layout.addWidget(self.progress_bar) + progress_volume_layout.addWidget(self.volume_slider) + layout.addLayout(progress_volume_layout) + self.setLayout(layout) + + # Event handlers + def play_selected_song(self, item): + index = self.playlist_widget.row(item) + self.current_index = index + filepath = self.player.playlist[self.current_index] + self.player.load_midi(filepath) + self.player.play_midi() + pygame.mixer.music.set_endevent(pygame.USEREVENT) + filename = os.path.basename(filepath) + self.current_midi_label.setText(f"Current MIDI: {filename}") + + def load_midi_file(self): + filepath, _ = QFileDialog.getOpenFileName(self, "Select MIDI File", filter="MIDI files (*.mid *.midi)") + if filepath: + filename = os.path.basename(filepath) + self.player.load_midi(filepath) + self.current_midi_label.setText(f"Current MIDI: {filename}") + self.player.play_midi() + self.playlist_widget.addItem(filename) + self.player.add_to_playlist(filepath) + + def load_folder(self): + folder = QFileDialog.getExistingDirectory(self, "Select Folder") + if folder: + for file in os.listdir(folder): + if file.endswith((".midi", ".mid")): + filepath = os.path.join(folder, file) + self.playlist_widget.addItem(file) + self.player.add_to_playlist(filepath) # Only add to playlist, don't load immediately!!!!!!!!!!!!!!! 
+ + #probably should be in the MidPlay class + + + def previous_song(self): + if self.player.playlist: + filepath = self.player.playlist[self.current_index] + filename = os.path.basename(filepath) + self.player.current_index = (self.player.current_index - 1) % len(self.player.playlist) + self.current_midi_label.setText(f"Current MIDI: {filename}") + self.player.play_midi() + + def clear_playlist(self): + self.player.clear_playlist() + self.playlist_widget.clear() + + def closeEvent(self, event): + confirmation = QMessageBox.question(self, "Exit Confirmation", "Are you sure you want to exit?", QMessageBox.Yes | QMessageBox.No) + if confirmation == QMessageBox.Yes: + pygame.mixer.quit() + pygame.quit() + event.accept() + else: + event.ignore() + +class MidPlay: + """The Heart of Midi Playback""" + + def __init__(self): + self.playlist = [] + self.current_midi = None + self.playing = False + self.current_index = 0 + + def load_midi(self, filepath: str) -> None: + def load(): + try: + self.current_midi = pretty_midi.PrettyMIDI(filepath) + pygame.mixer.music.load(filepath) + except Exception as e: + print(f"Error loading MIDI: {e}") + threading.Thread(target=load).start() + + def add_to_playlist(self, filepath: str) -> None: + self.playlist.append(filepath) + + def clear_playlist(self) -> None: + self.playlist = [] + + def play_midi(self) -> None: + def play(): + if self.current_midi: + self.current_midi.instruments[0].synthesize() + pygame.mixer.music.play() + self.playing = True + pygame.mixer.music.set_endevent(pygame.USEREVENT) + else: + print("No MIDI file loaded") + threading.Thread(target=play).start() + + def pause(self) -> None: + pygame.mixer.music.pause() + self.playing = False + + def stop(self) -> None: + pygame.mixer.music.stop() + self.playing = False + + + def next_song(self) -> None: + #print("Debug: next_song() called", self.playlist) debug line + if self.playlist: + self.current_index = (self.current_index + 1) % len(self.playlist) + filepath = 
self.playlist[self.current_index] + + # If a new MIDI was loaded before the last one ended, respect that as the new playlist start + if self.current_midi and self.playing: + # print("Debug: New MIDI loaded before last one ended") # debug line + self.load_midi(filepath) + self.play_midi() + # print("Debug: Filepath:", filepath) # debug line + # print("Debug: Current MIDI:", self.current_midi) # debug line + +if __name__ == '__main__': + app = QApplication([]) + player_gui = MidPlayGUI() + player_gui.show() + running = True + while True: + for event in pygame.event.get(): + if event.type == pygame.USEREVENT: + player_gui.player.next_song() + if event.type == pygame.QUIT: + running = False + break + app.exec_() \ No newline at end of file diff --git a/ScanOrg.py b/ScanOrg.py new file mode 100755 index 0000000..1a48c33 --- /dev/null +++ b/ScanOrg.py @@ -0,0 +1,213 @@ +#Path: ScanOrg.py +# Description: A class to scan and organize music files + +import concurrent.futures +import threading +import queue +import zipfile +import py7zr +import rarfile +import os +import mutagen +from PyQt5.QtCore import Qt, QSortFilterProxyModel, QAbstractTableModel, QModelIndex, QVariant, QAbstractItemModel, QFileInfo, QDir, QMimeDatabase, QMimeData, QUrl, QItemSelectionModel, QItemSelection, QItemSelectionRange, QObject, QThread, QTimer, QEventLoop, QCoreApplication, QUrl, pyqtSignal + +# Directory Filter Proxy Model +class DirectoryFilterProxyModel(QSortFilterProxyModel): + def __init__(self): + super().__init__() + self.setFilterCaseSensitivity(Qt.CaseInsensitive) + self.setFilterKeyColumn(0) + def filterAcceptsRow(self, source_row, source_parent): + index = self.sourceModel().index(source_row, 0, source_parent) + return self.sourceModel().isDir(index) + +# File Filter Proxy Model +class FileFilterProxyModel(QSortFilterProxyModel): + def __init__(self): + super().__init__() + self.setFilterCaseSensitivity(Qt.CaseInsensitive) + self.setFilterKeyColumn(0) + self.allowed_extensions = 
['.zip', '.mp3', '.wav', '.flac', '.mid', '.midi', '.aiff', '.aif', '.aifc', '.au', '.snd', '.wv', '.wma', '.m4a'] + + def filterAcceptsRow(self, source_row, source_parent): + index = self.sourceModel().index(source_row, 0, source_parent) + if self.sourceModel().isDir(index): + return True + else: + return self.sourceModel().fileName(index).endswith(tuple(self.allowed_extensions)) + +# File Scan and Organize +class file_scanner: + def __init__(self): + self.file_list = [] + self.cache = {} + + def scan(self, path): + def background_scan(self, path): + if path in self.cache: + return self.cache[path] + + file_list = [] + dirs_queue = queue.Queue() + dirs_queue.put(path) + + while not dirs_queue.empty(): + current_path = dirs_queue.get() + try: + for root, dirs, files in os.walk(current_path): + for dir in dirs: + dirs_queue.put(os.path.join(root, dir)) + for file in files: + if file.endswith(('.mp3', '.wav', '.flac', '.mid', '.midi', '.aiff', '.aif', '.aifc', '.au', '.snd', '.wv', '.wma', '.m4a')): + file_list.append(os.path.join(root, file)) + self.cache[current_path] = file_list + except (IOError, PermissionError, FileNotFoundError, OSError) as e: + print(f"Error Scanning Files: {e}") + + return file_list + + file_list = [] + thread = threading.Thread(target=background_scan, args=(path, file_list)) + thread.start() + return file_list + + def get_file_list(self): + return self.file_list + + def clear_file_list(self): + self.file_list = [] + +class Extractor: + def zipviewer(self, index, file_filter_model, list_model, extraction_directory): + if index.isValid() and extraction_directory is not None: + index = file_filter_model.mapToSource(index) + file_path = list_model.filePath(index) + + try: + if file_path.endswith(".zip"): + with zipfile.ZipFile(file_path, 'r') as zip_ref: + self._extract_files(zip_ref, extraction_directory) + elif file_path.endswith(".rar"): + with rarfile.RarFile(file_path, 'r') as rar_ref: + self._extract_files(rar_ref, extraction_directory) + 
elif file_path.endswith(".7z"): + with py7zr.SevenZipFile(file_path, 'r') as sevenzip_ref: + self._extract_files(sevenzip_ref, extraction_directory) + else: + print(f"Unsupported file format: {file_path}") + + except (zipfile.BadZipFile, zipfile.LargeZipFile) as e: + print(f"ZIP Extraction Error: {e}") + except (rarfile.RarFileException, rarfile.NotRARFile) as e: + print(f"RAR Extraction Error: {e}") + except py7zr.exceptions.SevenZipException as e: + print(f"7z Extraction Error: {e}") + except OSError as e: + print(f"Extraction Error: {e}") + + def _extract_files(self, archive_ref, extraction_directory): + for filename in archive_ref.namelist(): + destination = os.path.join(extraction_directory, filename) + if os.path.isfile(destination): + print(f"File already exists: {destination}") + else: + os.makedirs(os.path.dirname(destination), exist_ok=True) + with open(destination, 'wb') as file: + file.write(archive_ref.read(filename)) + print(f"Extracted: {filename}") + +class organizer: + global metadata_queue + metadata_queue = queue.Queue() + def __init__(self): + self.file_list = [] + self.artist_list = [] + self.album_list = [] + self.genre_list = [] + self.year_list = [] + self.file_scanner = file_scanner() + self.file_info_cache = {} + + def scan(self, path): + if path in self.file_scanner.cache: + self.file_list = self.file_scanner.cache[path] + else: + self.file_list = self.file_scanner.scan(path) + + def get_file_list(self): + return self.file_list + def clear_file_list(self): + self.file_list = [] + def get_artist_list(self): + return self.artist_list + def get_album_list(self): + return self.album_list + def get_genre_list(self): + return self.genre_list + def get_year_list(self): + return self.year_list + def clear_artist_list(self): + self.artist_list = [] + def clear_album_list(self): + self.album_list = [] + def clear_genre_list(self): + self.genre_list = [] + def clear_year_list(self): + self.year_list = [] + + def organize(self): + results_queue = 
queue.Queue() + metadata = pyqtSignal(dict) + with concurrent.futures.ThreadPoolExecutor() as executor: + futures = [] + for file in self.file_list: + futures.append(executor.submit(self.get_file_info, file, results_queue)) + + for future in concurrent.futures.as_completed(futures): + try: + metadata = future.result() + if metadata['artist'] not in self.artist_list: + self.artist_list.append(metadata['artist']) + if metadata['album'] not in self.album_list: + self.album_list.append(metadata['album']) + if metadata['genre'] not in self.genre_list: + self.genre_list.append(metadata['genre']) + if metadata['year'] not in self.year_list: + self.year_list.append(metadata['year']) + except mutagen.mp3.HeaderNotFoundError: + print('Error: ' + file) + continue + while not metadata_queue.put(metadata): + pass + + def get_file_info(self, file, results_queue): + try: + audio = mutagen.File(file) + artist = audio['artist'][0] + album = audio['album'][0] + genre = audio['genre'][0] + year = audio['date'][0] + if artist not in self.artist_list: + self.artist_list.append(artist) + if album not in self.album_list: + self.album_list.append(album) + if genre not in self.genre_list: + self.genre_list.append(genre) + if year not in self.year_list: + self.year_list.append(year) + metadata = { + 'artist': artist, + 'album': album, + 'genre': genre, + 'year': year + } + self.metadata_extracted.emit(metadata) + except Exception as e: + results_queue.put(None) + print('Error: ' + file) + if os.path.splitext(file)[1] == ('.mp3', '.wav', '.flac', '.m4a', '.wma', 'mid', '.midi'): + self.organize_audio() + audio = mutagen.File(file) + + + \ No newline at end of file diff --git a/__pycache__/MidPlay.cpython-310.pyc b/__pycache__/MidPlay.cpython-310.pyc new file mode 100755 index 0000000..00219d1 Binary files /dev/null and b/__pycache__/MidPlay.cpython-310.pyc differ diff --git a/__pycache__/ScanOrg.cpython-310.pyc b/__pycache__/ScanOrg.cpython-310.pyc new file mode 100755 index 0000000..be7279a 
Binary files /dev/null and b/__pycache__/ScanOrg.cpython-310.pyc differ diff --git a/compression.py b/compression.py new file mode 100755 index 0000000..e228356 --- /dev/null +++ b/compression.py @@ -0,0 +1,94 @@ +# imports +import os +import zipfile +import rarfile +import py7zr +import shutil +import tarfile +import argparse +import tqdm +from concurrent.futures import ThreadPoolExecutor +from multiprocessing import pool + +# File Compressor +class Compressor: + def __init__(self): + pass + + def _compress_folder(self, source_path, archive_file, archive_format): + for root, _, files in os.walk(source_path): + for file in files: + file_path = os.path.join(root, file) + archive_path = os.path.relpath(file_path, source_path) + self._compress_file(file_path, archive_file, archive_path, archive_format) + + def _compress_file(self, file_path, archive_file, archive_path, archive_format): + + if archive_format == "zip": + with open(file_path, 'rb') as file: + for chunk in iter(lambda: file.read(1024 * 1024), b''): + archive_file.writestr(archive_path, chunk) + + elif archive_format == "tar": + archive_file.add(file_path, arcname=archive_path) + + elif archive_format == "7z": + archive_file.write(file_path, archive_path) + + else: + raise ValueError(f"Unsupported archive format: {archive_format}") + + def compress(self, source_path, archive_name, archive_format="zip"): + pbar = tqdm.tqdm(total=100, unit="B", unit_scale=True, desc="Compressing") + supported_formats = ["zip", "tar", "7z"] + + if archive_format not in supported_formats: + raise ValueError(f"Unsupported archive format: {archive_format}") + archive_path = os.path.join(os.path.dirname(source_path), f"{archive_name}.{archive_format}") + + # Check if source path exists + if not os.path.exists(source_path): + print(f"Source path does not exist: {source_path}") + return + + # Check if archive path already exists + if os.path.exists(archive_path): + print(f"Archive path already exists: {archive_path}") + return + + 
# Open archive file based on format + if archive_format == "zip": + archive_file = zipfile.ZipFile(archive_path, 'w', zipfile.ZIP_DEFLATED) + elif archive_format == "tar": + archive_file = tarfile.open(archive_path, mode="w") + elif archive_format == "7z": + archive_file = py7zr.SevenZipFile(archive_path, mode="w") + + # Compress the source path + try: + + if os.path.isdir(source_path): + self._compress_folder(source_path, archive_file, archive_format) + pbar.update(1) + else: + if os.path.isfile(source_path): + self._compress_file(source_path, archive_file, "", archive_format) + pbar.update(1) + else: + print(f"Source path is not a file or directory: {source_path}") + return + except Exception as e: + print(f"Compressed to: {archive_path} error:{e}") + + finally: + archive_file.close() # Ensure closing the archive file + +if __name__ == "__main__": + parser = argparse.ArgumentParser(description="Compress files") + parser.add_argument("source", help="Path to the file or folder to compress") + parser.add_argument("archive_name", help="Name for the compressed archive") + parser.add_argument("-f", "--format", choices=["zip", "tar", "7z"], default="zip", help="Archive format") + args = parser.parse_args() + + compressor = Compressor() + compressor.compress(args.source, args.archive_name, args.format) diff --git a/docker-compose.debug.yml b/docker-compose.debug.yml new file mode 100755 index 0000000..bf8c4d0 --- /dev/null +++ b/docker-compose.debug.yml @@ -0,0 +1,11 @@ +version: '3.4' + +services: + fbrowser: + image: fbrowser + build: + context: . + dockerfile: ./Dockerfile + command: ["sh", "-c", "pip install debugpy -t /tmp && python /tmp/debugpy --wait-for-client --listen 0.0.0.0:5678 Fbrowser.py "] + ports: + - 5678:5678 diff --git a/docker-compose.yml b/docker-compose.yml new file mode 100755 index 0000000..a84be70 --- /dev/null +++ b/docker-compose.yml @@ -0,0 +1,8 @@ +version: '3.4' + +services: + fbrowser: + image: fbrowser + build: + context: . 
+ dockerfile: ./Dockerfile diff --git a/extraction.py b/extraction.py new file mode 100755 index 0000000..70e509d --- /dev/null +++ b/extraction.py @@ -0,0 +1,74 @@ +# extractor.py +import os +import zipfile +import rarfile +import py7zr +import argparse +from tqdm import tqdm +from concurrent.futures import ThreadPoolExecutor + +class Extractor: + + def zipviewer(self, source, destination): + print(f"checking if {source} exists") + if not os.path.exists(source): + print(f"Error: Archive file not found: {source}") + return + + try: + + print(f"checking if {destination} exists") + if not os.path.exists(destination): + print(f"{destination} does not exist, creating {destination}") + os.makedirs(destination) + print(f"{destination} created") + else: + print(f"{destination} exists") + + print(f"checking if {source} is a valid archive file") + if source.endswith(".zip"): + print(f"Extracting all files from {source} to {destination}") + with zipfile.ZipFile(source, 'r') as zip_ref: + zip_ref.extractall(destination) + print(f"Extracted all files from {source} to {destination}") + + elif source.endswith(".rar, .tar.gz, .tar.bz2, .tar.xz, .tar.zst"): + with rarfile.RarFile(source, 'r') as rar_ref: + rar_ref.extractall(destination) + print(f"Extracted all files from {source} to {destination}") + + elif source.endswith(".7z"): + with py7zr.SevenZipFile(source, 'r') as sevenzip_ref: + sevenzip_ref.extractall(destination) + print(f"Extracted all files from {source} to {destination}") + + else: + print(f"Unsupported file format: {source}") + + except (zipfile.BadZipFile, zipfile.LargeZipFile) as e: + print(f"ZIP Extraction Error: {e}") + except (rarfile.RarFileException, rarfile.NotRARFile) as e: + print(f"RAR Extraction Error: {e}") + except py7zr.exceptions.SevenZipException as e: + print(f"7z Extraction Error: {e}") + except OSError as e: + print(f"Extraction Error: {e}") + +def main(): + print("Welcome to the Archive Extractor!") + parser = 
argparse.ArgumentParser(description="Compress or extract files") + subparsers = parser.add_subparsers(title="Command", dest="command") + + # Subparser for extraction + extract_parser = subparsers.add_parser("extract") + extract_parser.add_argument("source", help="Path to the archive file") + extract_parser.add_argument("destination", help="Extraction directory") + args = parser.parse_args() + + if args.command == "extract": + print(f"Extracting {args.source} to {args.destination}") + extractor = Extractor() + extractor.zipviewer(args.source, args.destination) + +if __name__ == "__main__": + main() diff --git a/paq-8l_intel.exe b/paq-8l_intel.exe new file mode 100755 index 0000000..f6f3684 Binary files /dev/null and b/paq-8l_intel.exe differ diff --git a/paq7asm-x86_64.asm b/paq7asm-x86_64.asm new file mode 100755 index 0000000..a0754a6 --- /dev/null +++ b/paq7asm-x86_64.asm @@ -0,0 +1,102 @@ +; YASM x86-64 assembly language code for PAQ7/8 ver. 2, Jan 18, 2007 +; +; (C) 2005-2007, Matt Mahoney, Matthew Fite. +; This is free software under GPL, http://www.gnu.org/licenses/gpl.txt +; +; This code was tested on an Athlon-64 under Ubuntu Linux 2.6.15.27.amd64-generic +; with paq8f and paq8jd. It should work with any PAQ version since paq7, +; because all versions use the same paq7asm.asm code for 32 bit Windows/Linux +; versions. To compile e.g. paq8jd in Linux: +; +; yasm paq7asm-x86_64.asm -f elf -m amd64 +; g++ -O3 -s -fomit-frame-pointer -DUNIX paq8jd.cpp paq7asm-x86_64.o -o paq8jd +; +; This code has not been tested in Windows. (You would need XP Professional +; 64 bit edition and a 64 bit compiler). + +section .text + +BITS 64 + +; Vector product a*b of n signed words, returning signed dword scaled +; down by 8 bits. n is rounded up to a multiple of 8. 
+ + global dot_product ; (short* a, short* b, int n) + align 16 +dot_product: + mov rcx, rdx ; n + mov rax, rdi ; a + mov rdx, rsi ; b + add rcx, 7 ; n rounding up + and rcx, -8 + jz .done + sub rax, 16 + sub rdx, 16 + pxor xmm0, xmm0 ; sum = 0 +.loop: ; each loop sums 4 products + movdqa xmm1, [rax+rcx*2] ; put parital sums of vector product in xmm1 + pmaddwd xmm1, [rdx+rcx*2] + psrad xmm1, 8 + paddd xmm0, xmm1 + sub rcx, 8 + ja .loop + movdqa xmm1, xmm0 ; add 4 parts of xmm0 and return in eax + psrldq xmm1, 8 + paddd xmm0, xmm1 + movdqa xmm1, xmm0 + psrldq xmm1, 4 + paddd xmm0, xmm1 + movd rax, xmm0 +.done + ret + +; Train n neural network weights w[n] on inputs t[n] and err. +; w[i] += (t[i]*err*2 >> 16)+1 >> 1 bounded to +- 32K. +; n is rounded up to a multiple of 8. + +;1st arg rdi -> *t +;2nd arg rsi -> *w +;3rd arg rdx -> n +;4th arg rcx -> err (signed 16 bits) + + global train ; (short* t, short* w, int n, int err) + BITS 64 + align 16 +train: + mov rax, rcx ; err + and rax, 0xffff ; put 8 copies of err in xmm0 + movd xmm0, rax + movd xmm1, rax + pslldq xmm1, 2 + por xmm0, xmm1 + movdqa xmm1, xmm0 + pslldq xmm1, 4 + por xmm0, xmm1 + movdqa xmm1, xmm0 + pslldq xmm1, 8 + por xmm0, xmm1; + pcmpeqb xmm1, xmm1 ; 8 copies of 1 in xmm1 + psrlw xmm1, 15 + mov rcx, rdx ; n + mov rax, rdi ; t + mov rdx, rsi ; w + add rcx, 7 ; n/8 rounding up + and rcx, -8 + sub rax, 16 + sub rdx, 16 + jz .done + align 16 +.loop: ; each iteration adjusts 8 weights + movdqa xmm2, [rdx+rcx*2] ; w[i] + movdqa xmm3, [rax+rcx*2] ; t[i] + paddsw xmm3, xmm3 ; t[i]*2 + pmulhw xmm3, xmm0 ; t[i]*err*2 >> 16 + paddsw xmm3, xmm1 ; (t[i]*err*2 >> 16)+1 + psraw xmm3, 1 ; (t[i]*err*2 >> 16)+1 >> 1 + paddsw xmm2, xmm3 ; w[i] + xmm3 + movdqa [rdx+rcx*2], xmm2 + sub rcx, 8 + ja .loop +.done: + ret + diff --git a/paq7asm.asm b/paq7asm.asm new file mode 100755 index 0000000..82d55a7 --- /dev/null +++ b/paq7asm.asm @@ -0,0 +1,140 @@ +; NASM assembly language code for PAQ7. +; (C) 2005, Matt Mahoney. 
+; This is free software under GPL, http://www.gnu.org/licenses/gpl.txt +; +; MINGW g++: nasm paq7asm.asm -f win32 --prefix _ +; DJGPP g++: nasm paq7asm.asm -f coff --prefix _ +; Borland, Mars: nasm paq7asm.asm -f obj --prefix _ +; Linux: nasm paq7asm.asm -f elf +; +; For other Windows compilers try -f win32 or -f obj. Some old versions +; of Linux should use -f aout instead of -f elf. +; +; This code will only work on a Pentium-MMX or higher. It doesn't +; use extended (Katmai/SSE) instructions. It won't work +; in 64-bit mode. + +section .text use32 class=CODE + +; Reset after MMX +global do_emms +do_emms: + emms + ret + +; Vector product a*b of n signed words, returning signed dword scaled +; down by 8 bits. n is rounded up to a multiple of 8. + +global dot_product ; (short* a, short* b, int n) +align 16 +dot_product: + mov eax, [esp+4] ; a + mov edx, [esp+8] ; b + mov ecx, [esp+12] ; n + add ecx, 7 ; n rounding up + and ecx, -8 + jz .done + sub eax, 8 + sub edx, 8 + pxor mm0, mm0 ; sum = 0 +.loop: ; each loop sums 4 products + movq mm1, [eax+ecx*2] ; put halves of vector product in mm0 + pmaddwd mm1, [edx+ecx*2] + movq mm2, [eax+ecx*2-8] + pmaddwd mm2, [edx+ecx*2-8] + psrad mm1, 8 + psrad mm2, 8 + paddd mm0, mm1 + paddd mm0, mm2 + sub ecx, 8 + ja .loop + movq mm1, mm0 ; add 2 halves of mm0 and return in eax + psrlq mm1, 32 + paddd mm0, mm1 + movd eax, mm0 + emms +.done + ret + +; This should work on a Pentium 4 or higher in 32-bit mode, +; but it isn't much faster than the MMX version so I don't use it. 
+ +global dot_product_sse2 ; (short* a, short* b, int n) +align 16 +dot_product_sse2: + mov eax, [esp+4] ; a + mov edx, [esp+8] ; b + mov ecx, [esp+12] ; n + add ecx, 7 ; n rounding up + and ecx, -8 + jz .done + sub eax, 16 + sub edx, 16 + pxor xmm0, xmm0 ; sum = 0 +.loop: ; each loop sums 4 products + movdqa xmm1, [eax+ecx*2] ; put parital sums of vector product in xmm0 + pmaddwd xmm1, [edx+ecx*2] + psrad xmm1, 8 + paddd xmm0, xmm1 + sub ecx, 8 + ja .loop + movdqa xmm1, xmm0 ; add 4 parts of xmm0 and return in eax + psrldq xmm1, 8 + paddd xmm0, xmm1 + movdqa xmm1, xmm0 + psrldq xmm1, 4 + paddd xmm0, xmm1 + movd eax, xmm0 +.done + ret + + +; Train n neural network weights w[n] on inputs t[n] and err. +; w[i] += t[i]*err*2+1 >> 17 bounded to +- 32K. +; n is rounded up to a multiple of 8. + +global train ; (short* t, short* w, int n, int err) +align 16 +train: + mov eax, [esp+16] ; err + and eax, 0xffff ; put 4 copies of err in mm0 + movd mm0, eax + movd mm1, eax + psllq mm1, 16 + por mm0, mm1 + movq mm1, mm0 + psllq mm1, 32 + por mm0, mm1 + pcmpeqb mm1, mm1 ; 4 copies of 1 in mm1 + psrlw mm1, 15 + mov eax, [esp+4] ; t + mov edx, [esp+8] ; w + mov ecx, [esp+12] ; n + add ecx, 7 ; n/8 rounding up + and ecx, -8 + sub eax, 8 + sub edx, 8 + jz .done +.loop: ; each iteration adjusts 8 weights + movq mm2, [edx+ecx*2] ; w[i] + movq mm3, [eax+ecx*2] ; t[i] + movq mm4, [edx+ecx*2-8] ; w[i] + movq mm5, [eax+ecx*2-8] ; t[i] + paddsw mm3, mm3 + paddsw mm5, mm5 + pmulhw mm3, mm0 + pmulhw mm5, mm0 + paddsw mm3, mm1 + paddsw mm5, mm1 + psraw mm3, 1 + psraw mm5, 1 + paddsw mm2, mm3 + paddsw mm4, mm5 + movq [edx+ecx*2], mm2 + movq [edx+ecx*2-8], mm4 + sub ecx, 8 + ja .loop +.done: + emms + ret + diff --git a/paq7asmsse.asm b/paq7asmsse.asm new file mode 100755 index 0000000..98ff613 --- /dev/null +++ b/paq7asmsse.asm @@ -0,0 +1,93 @@ +; NASM assembly language code for PAQ7. +; (C) 2005, Matt Mahoney. +; train - written by wowtiger, Jan. 
30, 2007 +; +; This is free software under GPL, http://www.gnu.org/licenses/gpl.txt +; +; This code is a replacement for paq7asm.asm for newer processors +; supporting SSE2 instructions. It is about 1% faster than the +; equivalent MMX code. It can be linked with any version of paq7* +; or paq8*. Assemble as below, then link following the instructions +; in the C++ source code, replacing paq7asm.obj with paq7asmsse.obj. +; No C++ code changes are needed. +; +; MINGW g++: nasm paq7asmsse.asm -f win32 --prefix _ +; DJGPP g++: nasm paq7asmsse.asm -f coff --prefix _ +; Borland, Mars: nasm paq7asmsse.asm -f obj --prefix _ +; Linux: nasm paq7asmsse.asm -f elf +; + +section .text use32 class=CODE + +; Vector product a*b of n signed words, returning signed dword scaled +; down by 8 bits. n is rounded up to a multiple of 8. + +global dot_product ; (short* a, short* b, int n) +align 16 +dot_product: + mov eax, [esp+4] ; a + mov edx, [esp+8] ; b + mov ecx, [esp+12] ; n + add ecx, 7 ; n rounding up + and ecx, -8 + jz .done + sub eax, 16 + sub edx, 16 + pxor xmm0, xmm0 ; sum = 0 +.loop: ; each loop sums 4 products + movdqa xmm1, [eax+ecx*2] ; put parital sums of vector product in xmm0 + pmaddwd xmm1, [edx+ecx*2] + psrad xmm1, 8 + paddd xmm0, xmm1 + sub ecx, 8 + ja .loop + movdqa xmm1, xmm0 ; add 4 parts of xmm0 and return in eax + psrldq xmm1, 8 + paddd xmm0, xmm1 + movdqa xmm1, xmm0 + psrldq xmm1, 4 + paddd xmm0, xmm1 + movd eax, xmm0 +.done + ret + + +; Train n neural network weights w[n] on inputs t[n] and err. +; w[i] += t[i]*err*2+1 >> 17 bounded to +- 32K. +; n is rounded up to a multiple of 8. + +; Train for SSE2 +; Use this code to get some performance... 
+ +global train ; (short* t, short* w, int n, int err) +align 16 +train: + mov eax, [esp+4] ; t + mov edx, [esp+8] ; w + mov ecx, [esp+12] ; n + add ecx, 7 ; n/8 rounding up + and ecx, -8 + jz .done + sub eax, 16 + sub edx, 16 + movd xmm0, [esp+16] + pshuflw xmm0,xmm0,0 + punpcklqdq xmm0,xmm0 +.loop: ; each iteration adjusts 8 weights + movdqa xmm3, [eax+ecx*2] ; t[i] + movdqa xmm2, [edx+ecx*2] ; w[i] + paddsw xmm3, xmm3 ; t[i]*2 + pmulhw xmm3, xmm0 ; t[i]*err*2 >> 16 + paddsw xmm3, [_mask] ; (t[i]*err*2 >> 16)+1 + psraw xmm3, 1 ; (t[i]*err*2 >> 16)+1 >> 1 + paddsw xmm2, xmm3 ; w[i] + xmm3 + movdqa [edx+ecx*2], xmm2 + sub ecx, 8 + ja .loop +.done: + ret + +align 16 +_mask dd 10001h,10001h,10001h,10001h ; 8 copies of 1 in xmm1 + + diff --git a/paq8l.cpp b/paq8l.cpp new file mode 100755 index 0000000..3f69df8 --- /dev/null +++ b/paq8l.cpp @@ -0,0 +1,3575 @@ +/* paq8l file compressor/archiver. Release by Matt Mahoney, Mar. 8, 2007. + Updated Apr. 15, 2007 (no change to paq8l.exe). + + Copyright (C) 2006 Matt Mahoney, Serge Osnach, Alexander Ratushnyak, + Bill Pettis, Przemyslaw Skibinski, Matthew Fite, wowtiger, Andrew Paterson, + + + LICENSE + + This program is free software; you can redistribute it and/or + modify it under the terms of the GNU General Public License as + published by the Free Software Foundation; either version 2 of + the License, or (at your option) any later version. + + This program is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details at + Visit . + +To install and use in Windows: + +- To install, put paq8l.exe or a shortcut to it on your desktop. +- To compress a file or folder, drop it on the paq8l icon. +- To decompress, drop a .paq8l file on the icon. + +A .paq8l extension is added for compression, removed for decompression. 
+The output will go in the same folder as the input. + +While paq8l is working, a command window will appear and report +progress. When it is done you can close the window by pressing +ENTER or clicking [X]. + + +COMMAND LINE INTERFACE + +- To install, put paq8l.exe somewhere in your PATH. +- To compress: paq8l [-N] file1 [file2...] +- To decompress: paq8l [-d] file1.paq8l [dir2] +- To view contents: more < file1.paq8l + +The compressed output file is named by adding ".paq8l" extension to +the first named file (file1.paq8l). Each file that exists will be +added to the archive and its name will be stored without a path. +The option -N specifies a compression level ranging from -0 +(fastest) to -9 (smallest). The default is -5. If there is +no option and only one file, then the program will pause when +finished until you press the ENTER key (to support drag and drop). +If file1.paq8l exists then it is overwritten. + +If the first named file ends in ".paq8l" then it is assumed to be +an archive and the files within are extracted to the same directory +as the archive unless a different directory (dir2) is specified. +The -d option forces extraction even if there is not a ".paq8l" +extension. If any output file already exists, then it is compared +with the archive content and the first byte that differs is reported. +No files are overwritten or deleted. If there is only one argument +(no -d or dir2) then the program will pause when finished until +you press ENTER. + +For compression, if any named file is actually a directory, then all +files and subdirectories are compressed, preserving the directory +structure, except that empty directories are not stored, and file +attributes (timestamps, permissions, etc.) are not preserved. +During extraction, directories are created as needed. For example: + + paq8l -4 c:\tmp\foo bar + +compresses foo and bar (if they exist) to c:\tmp\foo.paq8l at level 4. + + paq8l -d c:\tmp\foo.paq8l . 
+ +extracts foo and compares bar in the current directory. If foo and bar +are directories then their contents are extracted/compared. + +There are no commands to update an existing archive or to extract +part of an archive. Files and archives larger than 2GB are not +supported (but might work on 64-bit machines, not tested). +File names with nonprintable characters are not supported (spaces +are OK). + + +TO COMPILE + +There are 2 files: paq8l.cpp (C++) and paq7asm.asm (NASM/YASM). +paq7asm.asm is the same as in paq7 and paq8x. paq8l.cpp recognizes the +following compiler options: + + -DWINDOWS (to compile in Windows) + -DUNIX (to compile in Unix, Linux, Solaris, MacOS/Darwin, etc) + -DNOASM (to replace paq7asm.asm with equivalent C++) + -DDEFAULT_OPTION=N (to change the default compression level from 5 to N). + +If you compile without -DWINDOWS or -DUNIX, you can still compress files, +but you cannot compress directories or create them during extraction. +You can extract directories if you manually create the empty directories +first. + +Use -DDEFAULT_OPTION=N to change the default compression level to support +drag and drop on machines with less than 256 MB of memory. Use +-DDEFAULT_OPTION=4 for 128 MB, 3 for 64 MB, 2 for 32 MB, etc. + +Use -DNOASM for non x86-32 machines, or older than a Pentium-MMX (about +1997), or if you don't have NASM or YASM to assemble paq7asm.asm. The +program will still work but it will be slower. For NASM in Windows, +use the options "--prefix _" and either "-f win32" or "-f obj" depending +on your C++ compiler. In Linux, use "-f elf". 
+ +Recommended compiler commands and optimizations: + + MINGW g++: + nasm paq7asm.asm -f win32 --prefix _ + g++ paq8l.cpp -DWINDOWS -O2 -Os -s -march=pentiumpro -fomit-frame-pointer -o paq8l.exe paq7asm.obj + + Borland: + nasm paq7asm.asm -f obj --prefix _ + bcc32 -DWINDOWS -O -w-8027 paq8l.cpp paq7asm.obj + + Mars: + nasm paq7asm.asm -f obj --prefix _ + dmc -DWINDOWS -Ae -O paq8l.cpp paq7asm.obj + + UNIX/Linux (PC): + nasm -f elf paq7asm.asm + g++ paq8l.cpp -DUNIX -O2 -Os -s -march=pentiumpro -fomit-frame-pointer -o paq8l paq7asm.o + + Non PC (e.g. PowerPC under MacOS X) + g++ paq8l.cpp -O2 -DUNIX -DNOASM -s -o paq8l + +MinGW produces faster executables than Borland or Mars, but Intel 9 +is about 4% faster than MinGW). + + +ARCHIVE FILE FORMAT + +An archive has the following format. It is intended to be both +human and machine readable. The header ends with CTRL-Z (Windows EOF) +so that the binary compressed data is not displayed on the screen. + + paq8l -N CR LF + size TAB filename CR LF + size TAB filename CR LF + ... + CTRL-Z + compressed binary data + +-N is the option (-0 to -9), even if a default was used. +Plain file names are stored without a path. Files in compressed +directories are stored with path relative to the compressed directory +(using UNIX style forward slashes "/"). For example, given these files: + + 123 C:\dir1\file1.txt + 456 C:\dir2\file2.txt + +Then + + paq8l archive \dir1\file1.txt \dir2 + +will create archive.paq8l with the header: + + paq8l -5 + 123 file1.txt + 456 dir2/file2.txt + +The command: + + paq8l archive.paq8l C:\dir3 + +will create the files: + + C:\dir3\file1.txt + C:\dir3\dir2\file2.txt + +Decompression will fail if the first 7 bytes are not "paq8l -". Sizes +are stored as decimal numbers. CR, LF, TAB, CTRL-Z are ASCII codes +13, 10, 9, 26 respectively. + + +ARITHMETIC CODING + +The binary data is arithmetic coded as the shortest base 256 fixed point +number x = SUM_i x_i 256^-1-i such that p(= 16. 
+ + The primary output is t_i := stretch(sm(n0,n1,h)), where sm(.) is + a stationary map with K = 1/256, initialized to + sm(n0,n1,h) = (n1+(1/64))/(n+2/64). Four additional inputs are also + computed to improve compression slightly: + + p1_i = sm(n0,n1,h) + p0_i = 1 - p1_i + t_i := stretch(p_1) + t_i+1 := K1 (p1_i - p0_i) + t_i+2 := K2 stretch(p1) if n0 = 0, -K2 stretch(p1) if n1 = 0, else 0 + t_i+3 := K3 (-p0_i if n1 = 0, p1_i if n0 = 0, else 0) + t_i+4 := K3 (-p0_i if n0 = 0, p1_i if n1 = 0, else 0) + + where K1..K4 are ad-hoc constants. + + h is updated as follows: + If n < 4, append y_j to h. + Else if n <= 16, set h := y_j. + Else h = 0. + + The update rule is biased toward newer data in a way that allows + n0 or n1, but not both, to grow large by discarding counts of the + opposite bit. Large counts are incremented probabilistically. + Specifically, when y_j = 0 then the update rule is: + + n0 := n0 + 1, n < 29 + n0 + 1 with probability 2^(27-n0)/2 else n0, 29 <= n0 < 41 + n0, n = 41. + n1 := n1, n1 <= 5 + round(8/3 lg n1), if n1 > 5 + + swapping (n0,n1) when y_j = 1. + + Furthermore, to allow an 8 bit representation for (n0,n1,h), states + exceeding the following values of n0 or n1 are replaced with the + state with the closest ratio n0:n1 obtained by decrementing the + smaller count: (41,0,h), (40,1,h), (12,2,h), (5,3,h), (4,4,h), + (3,5,h), (2,12,h), (1,40,h), (0,41,h). For example: + (12,2,1) 0-> (7,1,0) because there is no state (13,2,0). + +- Match Model. The state is (c,b), initially (0,0), where c is 1 if + the context was previously seen, else 0, and b is the next bit in + this context. The prediction is: + + t_i := (2b - 1)Kc log(m + 1) + + where m is the length of the context. The update rule is c := 1, + b := y_j. A match model can be implemented efficiently by storing + input in a buffer and storing pointers into the buffer into a hash + table indexed by context. 
Then c is indicated by a hash table entry + and b can be retrieved from the buffer. + + +CONTEXTS + +High compression is achieved by combining a large number of contexts. +Most (not all) contexts start on a byte boundary and end on the bit +immediately preceding the predicted bit. The contexts below are +modeled with both a run map and a nonstationary map unless indicated. + +- Order n. The last n bytes, up to about 16. For general purpose data. + Most of the compression occurs here for orders up to about 6. + An order 0 context includes only the 0-7 bits of the partially coded + byte and the number of these bits (255 possible values). + +- Sparse. Usually 1 or 2 of the last 8 bytes preceding the byte containing + the predicted bit, e.g (2), (3),..., (8), (1,3), (1,4), (1,5), (1,6), + (2,3), (2,4), (3,6), (4,8). The ordinary order 1 and 2 context, (1) + or (1,2) are included above. Useful for binary data. + +- Text. Contexts consist of whole words (a-z, converted to lower case + and skipping other values). Contexts may be sparse, e.g (0,2) meaning + the current (partially coded) word and the second word preceding the + current one. Useful contexts are (0), (0,1), (0,1,2), (0,2), (0,3), + (0,4). The preceding byte may or may not be included as context in the + current word. + +- Formatted text. The column number (determined by the position of + the last linefeed) is combined with other contexts: the character to + the left and the character above it. + +- Fixed record length. The record length is determined by searching for + byte sequences with a uniform stride length. Once this is found, then + the record length is combined with the context of the bytes immediately + preceding it and the corresponding byte locations in the previous + one or two records (as with formatted text). + +- Context gap. The distance to the previous occurrence of the order 1 + or order 2 context is combined with other low order (1-2) contexts. + +- FAX. For 2-level bitmapped images. 
Contexts are the surrounding + pixels already seen. Image width is assumed to be 1728 bits (as + in calgary/pic). + +- Image. For uncompressed 24-bit color BMP and TIFF images. Contexts + are the high order bits of the surrounding pixels and linear + combinations of those pixels, including other color planes. The + image width is detected from the file header. When an image is + detected, other models are turned off to improve speed. + +- JPEG. Files are further compressed by partially uncompressing back + to the DCT coefficients to provide context for the next Huffman code. + Only baseline DCT-Huffman coded files are modeled. (This is about + 90% of images, the others are usually progressive coded). JPEG images + embedded in other files (quite common) are detected by headers. The + baseline JPEG coding process is: + - Convert to grayscale and 2 chroma colorspace. + - Sometimes downsample the chroma images 2:1 or 4:1 in X and/or Y. + - Divide each of the 3 images into 8x8 blocks. + - Convert using 2-D discrete cosine transform (DCT) to 64 12-bit signed + coefficients. + - Quantize the coefficients by integer division (lossy). + - Split the image into horizontal slices coded independently, separated + by restart codes. + - Scan each block starting with the DC (0,0) coefficient in zigzag order + to the (7,7) coefficient, interleaving the 3 color components in + order to scan the whole image left to right starting at the top. + - Subtract the previous DC component from the current in each color. + - Code the coefficients using RS codes, where R is a run of R zeros (0-15) + and S indicates 0-11 bits of a signed value to follow. (There is a + special RS code (EOB) to indicate the rest of the 64 coefficients are 0). + - Huffman code the RS symbol, followed by S literal bits. + The most useful contexts are the current partially coded Huffman code + (including S following bits) combined with the coefficient position + (0-63), color (0-2), and last few RS codes. 
+ +- Match. When a context match of 400 bytes or longer is detected, + the next bit of the match is predicted and other models are turned + off to improve speed. + +- Exe. When a x86 file (.exe, .obj, .dll) is detected, sparse contexts + with gaps of 1-12 selecting only the prefix, opcode, and the bits + of the modR/M byte that are relevant to parsing are selected. + This model is turned off otherwise. + +- Indirect. The history of the last 1-3 bytes in the context of the + last 1-2 bytes is combined with this 1-2 byte context. + +- DMC. A bitwise n-th order context is built from a state machine using + DMC, described in http://plg.uwaterloo.ca/~ftp/dmc/dmc.c + The effect is to extend a single context, one bit at a time and predict + the next bit based on the history in this context. The model here differs + in that two predictors are used. One is a pair of counts as in the original + DMC. The second predictor is a bit history state mapped adaptively to + a probability as in a Nonstationary Map. + +ARCHITECTURE + +The context models are mixed by several of several hundred neural networks +selected by a low-order context. The outputs of these networks are +combined using a second neural network, then fed through several stages of +adaptive probability maps (APM) before arithmetic coding. + +For images, only one neural network is used and its context is fixed. + +An APM is a stationary map combining a context and an input probability. +The input probability is stretched and divided into 32 segments to +combine with other contexts. The output is interpolated between two +adjacent quantized values of stretch(p1). There are 2 APM stages in series: + + p1 := (p1 + 3 APM(order 0, p1)) / 4. + p1 := (APM(order 1, p1) + 2 APM(order 2, p1) + APM(order 3, p1)) / 4. + +PREPROCESSING + +paq8l uses preprocessing transforms on certain data types to improve +compression. 
To improve reliability, the decoding transform is +tested during compression to ensure that the input file can be +restored. If the decoder output is not identical to the input file +due to a bug, then the transform is abandoned and the data is compressed +without a transform so that it will still decompress correctly. + +The input is split into blocks with the format <type> <decoded size> <data> +where <type> is 1 byte (0 = no transform), <decoded size> is the size +of the data after decoding, which may be different than the size of <data>. +Blocks do not span file boundaries, and have a maximum size of 4MB to +2GB depending on compression level. Large files are split into blocks +of this size. The preprocessor has 3 parts: + +- Detector. Splits the input into smaller blocks depending on data type. + +- Coder. Input is a block to be compressed. Output is a temporary + file. The coder determines whether a transform is to be applied + based on file type, and if so, which one. A coder may use lots + of resources (memory, time) and make multiple passes through the + input file. The file type is stored (as one byte) during compression. + +- Decoder. Performs the inverse transform of the coder. It uses few + resources (fast, low memory) and runs in a single pass (stream oriented). + It takes input either from a file or the arithmetic decoder. Each call + to the decoder returns a single decoded byte. + +The following transforms are used: + +- EXE: CALL (0xE8) and JMP (0xE9) address operands are converted from + relative to absolute address. The transform is to replace the sequence + E8/E9 xx xx xx 00/FF by adding file offset modulo 2^25 (signed range, + little-endian format). Data to transform is identified by trying the + transform and applying a crude compression test: testing whether the + byte following the E8/E9 (LSB of the address) occurred more recently + in the transformed data than the original and within 4KB 4 times in + a row. The block ends when this does not happen for 4KB. 
+ +- JPEG: detected by SOI and SOF and ending with EOI or any nondecodable + data. No transform is applied. The purpose is to separate images + embedded in executables to block the EXE transform, and for a future + place to insert a transform. + + +IMPLEMENTATION + +Hash tables are designed to minimize cache misses, which consume most +of the CPU time. + +Most of the memory is used by the nonstationary context models. +Contexts are represented by 32 bits, possibly a hash. These are +mapped to a bit history, represented by 1 byte. The hash table is +organized into 64-byte buckets on cache line boundaries. Each bucket +contains 7 x 7 bit histories, 7 16-bit checksums, and a 2 element LRU +queue packed into one byte. Each 7 byte element represents 7 histories +for a context ending on a 3-bit boundary plus 0-2 more bits. One +element (for bits 0-1, which have 4 unused bytes) also contains a run model +consisting of the last byte seen and a count (as 1 byte each). + +Run models use 4 byte hash elements consisting of a 2 byte checksum, a +repeat count (0-255) and the byte value. The count also serves as +a priority. + +Stationary models are most appropriate for small contexts, so the +context is used as a direct table lookup without hashing. + +The match model maintains a pointer to the last match until a mismatching +bit is found. At the start of the next byte, the hash table is referenced +to find another match. The hash table of pointers is updated after each +whole byte. There is no checksum. Collisions are detected by comparing +the current and matched context in a rotating buffer. + +The inner loops of the neural network prediction (1) and training (2) +algorithms are implemented in MMX assembler, which computes 4 elements +at a time. Using assembler is 8 times faster than C++ for this code +and 1/3 faster overall. (However I found that SSE2 code on an AMD-64, +which computes 8 elements at a time, is not any faster). 
+ + +DIFFERENCES FROM PAQ7 + +An .exe model and filter are added. Context maps are improved using 16-bit +checksums to reduce collisions. The state table uses probabilistic updates +for large counts, more states that remember the last bit, and decreased +discounting of the opposite count. It is implemented as a fixed table. +There are also many minor changes. + +DIFFERENCES FROM PAQ8A + +The user interface supports directory compression and drag and drop. +The preprocessor segments the input into blocks and uses more robust +EXE detection. An indirect context model was added. There is no +dictionary preprocessor like PAQ8B/C/D/E. + +DIFFERENCES FROM PAQ8F + +Different models, usually from paq8hp*. Also changed rate from 8 to 7. A bug +in Array was fixed that caused the program to silently crash upon exit. + +DIFFERENCES FROM PAQ8J + +1) Slightly improved sparse model. +2) Added new family of sparse contexts. Each byte is mapped to a 3-bit value, where +different values correspond to different byte classes. For example, input +byte 0x00 is transformed into 0, all bytes that are less than 16 -- into 5, all +punctuation marks (ispunct(c)!=0) -- into 2 etc. Then these flags from the 11 +previous bytes are combined into a 32-bit pseudo-context. + +All these improvements give only 62 bytes on BOOK1, but on binaries the archive size +is reduced by 1-2%. + +DIFFERENCES FROM PAQ8JA + +Introduced distance model. Distance model uses the distance to the last occurrence +of some anchor char ( 0x00, space, newline, 0xff ), combined with previous +characters as context. This slightly improves compression of files with +variable-width record data. + +DIFFERENCES FROM PAQ8JB + +Restored recordModel(), broken in paq8hp*. Slightly tuned indirectModel(). + +DIFFERENCES FROM PAQ8JC + +Changed the APMs in the Predictor. Up to a 0.2% improvement for some files. + +DIFFERENCES FROM PAQ8JD + +Added DMCModel. Removed some redundant models from SparseModel and other +minor tuneups. Changes introduced in PAQ8K were not carried over. 
+ +PAQ8L v.2 + +Changed Mixer::p() to p() to fix a compiler error in Linux +(patched by Indrek Kruusa, Apr. 15, 2007). + +*/ + +#define PROGNAME "paq8l" // Please change this if you change the program. + +#include +#include +#include +#include +#include +#include +#define NDEBUG // remove for debugging (turns on Array bound checks) +#include + +#ifdef UNIX +#include +#include +#include +#include +#endif + +#ifdef WINDOWS +#include +#endif + +#ifndef DEFAULT_OPTION +#define DEFAULT_OPTION 5 +#endif + +// 8, 16, 32 bit unsigned types (adjust as appropriate) +typedef unsigned char U8; +typedef unsigned short U16; +typedef unsigned int U32; + +// min, max functions +#ifndef WINDOWS +inline int min(int a, int b) {return a='A'&&c1<='Z') c1+='a'-'A'; + int c2=*b; + if (c2>='A'&&c2<='Z') c2+='a'-'A'; + if (c1!=c2) return 0; + ++a; + ++b; + } + return *a==*b; +} + +//////////////////////// Program Checker ///////////////////// + +// Track time and memory used +class ProgramChecker { + int memused; // bytes allocated by Array now + int maxmem; // most bytes allocated ever + clock_t start_time; // in ticks +public: + void alloc(int n) { // report memory allocated, may be negative + memused+=n; + if (memused>maxmem) maxmem=memused; + } + ProgramChecker(): memused(0), maxmem(0) { + start_time=clock(); + assert(sizeof(U8)==1); + assert(sizeof(U16)==2); + assert(sizeof(U32)==4); + assert(sizeof(short)==2); + assert(sizeof(int)==4); + } + void print() const { // print time and memory used + printf("Time %1.2f sec, used %d bytes of memory\n", + double(clock()-start_time)/CLOCKS_PER_SEC, maxmem); + } +} programChecker; + +//////////////////////////// Array //////////////////////////// + +// Array a(n); creates n elements of T initialized to 0 bits. +// Constructors for T are not called. +// Indexing is bounds checked if assertions are on. +// a.size() returns n. +// a.resize(n) changes size to n, padding with 0 bits or truncating. 
+// a.push_back(x) appends x and increases size by 1, reserving up to size*2. +// a.pop_back() decreases size by 1, does not free memory. +// Copy and assignment are not supported. +// Memory is aligned on a ALIGN byte boundary (power of 2), default is none. + +template class Array { +private: + int n; // user size + int reserved; // actual size + char *ptr; // allocated memory, zeroed + T* data; // start of n elements of aligned data + void create(int i); // create with size i +public: + explicit Array(int i=0) {create(i);} + ~Array(); + T& operator[](int i) { +#ifndef NDEBUG + if (i<0 || i>=n) fprintf(stderr, "%d out of bounds %d\n", i, n), quit(); +#endif + return data[i]; + } + const T& operator[](int i) const { +#ifndef NDEBUG + if (i<0 || i>=n) fprintf(stderr, "%d out of bounds %d\n", i, n), quit(); +#endif + return data[i]; + } + int size() const {return n;} + void resize(int i); // change size to i + void pop_back() {if (n>0) --n;} // decrement size + void push_back(const T& x); // increment size, append x +private: + Array(const Array&); // no copy or assignment + Array& operator=(const Array&); +}; + +template void Array::resize(int i) { + if (i<=reserved) { + n=i; + return; + } + char *saveptr=ptr; + T *savedata=data; + int saven=n; + create(i); + if (saveptr) { + if (savedata) { + memcpy(data, savedata, sizeof(T)*min(i, saven)); + programChecker.alloc(-ALIGN-n*sizeof(T)); + } + free(saveptr); + } +} + +template void Array::create(int i) { + n=reserved=i; + if (i<=0) { + data=0; + ptr=0; + return; + } + const int sz=ALIGN+n*sizeof(T); + programChecker.alloc(sz); + ptr = (char*)calloc(sz, 1); + if (!ptr) quit("Out of memory"); + data = (ALIGN ? 
(T*)(ptr+ALIGN-(((long)ptr)&(ALIGN-1))) : (T*)ptr); + assert((char*)data>=ptr && (char*)data<=ptr+ALIGN); +} + +template Array::~Array() { + programChecker.alloc(-ALIGN-n*sizeof(T)); + free(ptr); +} + +template void Array::push_back(const T& x) { + if (n==reserved) { + int saven=n; + resize(max(1, n*2)); + n=saven; + } + data[n++]=x; +} + +/////////////////////////// String ///////////////////////////// + +// A tiny subset of std::string +// size() includes NUL terminator. + +class String: public Array { +public: + const char* c_str() const {return &(*this)[0];} + void operator=(const char* s) { + resize(strlen(s)+1); + strcpy(&(*this)[0], s); + } + void operator+=(const char* s) { + assert(s); + pop_back(); + while (*s) push_back(*s++); + push_back(0); + } + String(const char* s=""): Array(1) { + (*this)+=s; + } +}; + + +//////////////////////////// rnd /////////////////////////////// + +// 32-bit pseudo random number generator +class Random{ + Array table; + int i; +public: + Random(): table(64) { + table[0]=123456789; + table[1]=987654321; + for(int j=0; j<62; j++) table[j+2]=table[j+1]*11+table[j]*23/16; + i=0; + } + U32 operator()() { + return ++i, table[i&63]=table[i-24&63]^table[i-55&63]; + } +} rnd; + +////////////////////////////// Buf ///////////////////////////// + +// Buf(n) buf; creates an array of n bytes (must be a power of 2). +// buf[i] returns a reference to the i'th byte with wrap (no out of bounds). +// buf(i) returns i'th byte back from pos (i > 0) +// buf.size() returns n. 
+ +int pos; // Number of input bytes in buf (not wrapped) + +class Buf { + Array b; +public: + Buf(int i=0): b(i) {} + void setsize(int i) { + if (!i) return; + assert(i>0 && (i&(i-1))==0); + b.resize(i); + } + U8& operator[](int i) { + return b[i&b.size()-1]; + } + int operator()(int i) const { + assert(i>0); + return b[pos-i&b.size()-1]; + } + int size() const { + return b.size(); + } +}; + +/////////////////////// Global context ///////////////////////// + +int level=DEFAULT_OPTION; // Compression level 0 to 9 +#define MEM (0x10000< t; +public: + int operator()(U16 x) const {return t[x];} + Ilog(); +} ilog; + +// Compute lookup table by numerical integration of 1/x +Ilog::Ilog(): t(65536) { + U32 x=14155776; + for (int i=2; i<65536; ++i) { + x+=774541002/(i*2-1); // numerator is 2^29/ln 2 + t[i]=x>>24; + } +} + +// llog(x) accepts 32 bits +inline int llog(U32 x) { + if (x>=0x1000000) + return 256+ilog(x>>16); + else if (x>=0x10000) + return 128+ilog(x>>8); + else + return ilog(x); +} + +///////////////////////// state table //////////////////////// + +// State table: +// nex(state, 0) = next state if bit y is 0, 0 <= state < 256 +// nex(state, 1) = next state if bit y is 1 +// nex(state, 2) = number of zeros in bit history represented by state +// nex(state, 3) = number of ones represented +// +// States represent a bit history within some context. +// State 0 is the starting state (no bits seen). +// States 1-30 represent all possible sequences of 1-4 bits. +// States 31-252 represent a pair of counts, (n0,n1), the number +// of 0 and 1 bits respectively. If n0+n1 < 16 then there are +// two states for each pair, depending on if a 0 or 1 was the last +// bit seen. +// If n0 and n1 are too large, then there is no state to represent this +// pair, so another state with about the same ratio of n0/n1 is substituted. +// Also, when a bit is observed and the count of the opposite bit is large, +// then part of this count is discarded to favor newer data over old. 
+ +#if 1 // change to #if 0 to generate this table at run time (4% slower) +static const U8 State_table[256][4]={ + { 1, 2, 0, 0},{ 3, 5, 1, 0},{ 4, 6, 0, 1},{ 7, 10, 2, 0}, // 0-3 + { 8, 12, 1, 1},{ 9, 13, 1, 1},{ 11, 14, 0, 2},{ 15, 19, 3, 0}, // 4-7 + { 16, 23, 2, 1},{ 17, 24, 2, 1},{ 18, 25, 2, 1},{ 20, 27, 1, 2}, // 8-11 + { 21, 28, 1, 2},{ 22, 29, 1, 2},{ 26, 30, 0, 3},{ 31, 33, 4, 0}, // 12-15 + { 32, 35, 3, 1},{ 32, 35, 3, 1},{ 32, 35, 3, 1},{ 32, 35, 3, 1}, // 16-19 + { 34, 37, 2, 2},{ 34, 37, 2, 2},{ 34, 37, 2, 2},{ 34, 37, 2, 2}, // 20-23 + { 34, 37, 2, 2},{ 34, 37, 2, 2},{ 36, 39, 1, 3},{ 36, 39, 1, 3}, // 24-27 + { 36, 39, 1, 3},{ 36, 39, 1, 3},{ 38, 40, 0, 4},{ 41, 43, 5, 0}, // 28-31 + { 42, 45, 4, 1},{ 42, 45, 4, 1},{ 44, 47, 3, 2},{ 44, 47, 3, 2}, // 32-35 + { 46, 49, 2, 3},{ 46, 49, 2, 3},{ 48, 51, 1, 4},{ 48, 51, 1, 4}, // 36-39 + { 50, 52, 0, 5},{ 53, 43, 6, 0},{ 54, 57, 5, 1},{ 54, 57, 5, 1}, // 40-43 + { 56, 59, 4, 2},{ 56, 59, 4, 2},{ 58, 61, 3, 3},{ 58, 61, 3, 3}, // 44-47 + { 60, 63, 2, 4},{ 60, 63, 2, 4},{ 62, 65, 1, 5},{ 62, 65, 1, 5}, // 48-51 + { 50, 66, 0, 6},{ 67, 55, 7, 0},{ 68, 57, 6, 1},{ 68, 57, 6, 1}, // 52-55 + { 70, 73, 5, 2},{ 70, 73, 5, 2},{ 72, 75, 4, 3},{ 72, 75, 4, 3}, // 56-59 + { 74, 77, 3, 4},{ 74, 77, 3, 4},{ 76, 79, 2, 5},{ 76, 79, 2, 5}, // 60-63 + { 62, 81, 1, 6},{ 62, 81, 1, 6},{ 64, 82, 0, 7},{ 83, 69, 8, 0}, // 64-67 + { 84, 71, 7, 1},{ 84, 71, 7, 1},{ 86, 73, 6, 2},{ 86, 73, 6, 2}, // 68-71 + { 44, 59, 5, 3},{ 44, 59, 5, 3},{ 58, 61, 4, 4},{ 58, 61, 4, 4}, // 72-75 + { 60, 49, 3, 5},{ 60, 49, 3, 5},{ 76, 89, 2, 6},{ 76, 89, 2, 6}, // 76-79 + { 78, 91, 1, 7},{ 78, 91, 1, 7},{ 80, 92, 0, 8},{ 93, 69, 9, 0}, // 80-83 + { 94, 87, 8, 1},{ 94, 87, 8, 1},{ 96, 45, 7, 2},{ 96, 45, 7, 2}, // 84-87 + { 48, 99, 2, 7},{ 48, 99, 2, 7},{ 88,101, 1, 8},{ 88,101, 1, 8}, // 88-91 + { 80,102, 0, 9},{103, 69,10, 0},{104, 87, 9, 1},{104, 87, 9, 1}, // 92-95 + {106, 57, 8, 2},{106, 57, 8, 2},{ 62,109, 2, 8},{ 62,109, 2, 8}, // 96-99 
+ { 88,111, 1, 9},{ 88,111, 1, 9},{ 80,112, 0,10},{113, 85,11, 0}, // 100-103 + {114, 87,10, 1},{114, 87,10, 1},{116, 57, 9, 2},{116, 57, 9, 2}, // 104-107 + { 62,119, 2, 9},{ 62,119, 2, 9},{ 88,121, 1,10},{ 88,121, 1,10}, // 108-111 + { 90,122, 0,11},{123, 85,12, 0},{124, 97,11, 1},{124, 97,11, 1}, // 112-115 + {126, 57,10, 2},{126, 57,10, 2},{ 62,129, 2,10},{ 62,129, 2,10}, // 116-119 + { 98,131, 1,11},{ 98,131, 1,11},{ 90,132, 0,12},{133, 85,13, 0}, // 120-123 + {134, 97,12, 1},{134, 97,12, 1},{136, 57,11, 2},{136, 57,11, 2}, // 124-127 + { 62,139, 2,11},{ 62,139, 2,11},{ 98,141, 1,12},{ 98,141, 1,12}, // 128-131 + { 90,142, 0,13},{143, 95,14, 0},{144, 97,13, 1},{144, 97,13, 1}, // 132-135 + { 68, 57,12, 2},{ 68, 57,12, 2},{ 62, 81, 2,12},{ 62, 81, 2,12}, // 136-139 + { 98,147, 1,13},{ 98,147, 1,13},{100,148, 0,14},{149, 95,15, 0}, // 140-143 + {150,107,14, 1},{150,107,14, 1},{108,151, 1,14},{108,151, 1,14}, // 144-147 + {100,152, 0,15},{153, 95,16, 0},{154,107,15, 1},{108,155, 1,15}, // 148-151 + {100,156, 0,16},{157, 95,17, 0},{158,107,16, 1},{108,159, 1,16}, // 152-155 + {100,160, 0,17},{161,105,18, 0},{162,107,17, 1},{108,163, 1,17}, // 156-159 + {110,164, 0,18},{165,105,19, 0},{166,117,18, 1},{118,167, 1,18}, // 160-163 + {110,168, 0,19},{169,105,20, 0},{170,117,19, 1},{118,171, 1,19}, // 164-167 + {110,172, 0,20},{173,105,21, 0},{174,117,20, 1},{118,175, 1,20}, // 168-171 + {110,176, 0,21},{177,105,22, 0},{178,117,21, 1},{118,179, 1,21}, // 172-175 + {110,180, 0,22},{181,115,23, 0},{182,117,22, 1},{118,183, 1,22}, // 176-179 + {120,184, 0,23},{185,115,24, 0},{186,127,23, 1},{128,187, 1,23}, // 180-183 + {120,188, 0,24},{189,115,25, 0},{190,127,24, 1},{128,191, 1,24}, // 184-187 + {120,192, 0,25},{193,115,26, 0},{194,127,25, 1},{128,195, 1,25}, // 188-191 + {120,196, 0,26},{197,115,27, 0},{198,127,26, 1},{128,199, 1,26}, // 192-195 + {120,200, 0,27},{201,115,28, 0},{202,127,27, 1},{128,203, 1,27}, // 196-199 + {120,204, 0,28},{205,115,29, 0},{206,127,28, 
1},{128,207, 1,28}, // 200-203 + {120,208, 0,29},{209,125,30, 0},{210,127,29, 1},{128,211, 1,29}, // 204-207 + {130,212, 0,30},{213,125,31, 0},{214,137,30, 1},{138,215, 1,30}, // 208-211 + {130,216, 0,31},{217,125,32, 0},{218,137,31, 1},{138,219, 1,31}, // 212-215 + {130,220, 0,32},{221,125,33, 0},{222,137,32, 1},{138,223, 1,32}, // 216-219 + {130,224, 0,33},{225,125,34, 0},{226,137,33, 1},{138,227, 1,33}, // 220-223 + {130,228, 0,34},{229,125,35, 0},{230,137,34, 1},{138,231, 1,34}, // 224-227 + {130,232, 0,35},{233,125,36, 0},{234,137,35, 1},{138,235, 1,35}, // 228-231 + {130,236, 0,36},{237,125,37, 0},{238,137,36, 1},{138,239, 1,36}, // 232-235 + {130,240, 0,37},{241,125,38, 0},{242,137,37, 1},{138,243, 1,37}, // 236-239 + {130,244, 0,38},{245,135,39, 0},{246,137,38, 1},{138,247, 1,38}, // 240-243 + {140,248, 0,39},{249,135,40, 0},{250, 69,39, 1},{ 80,251, 1,39}, // 244-247 + {140,252, 0,40},{249,135,41, 0},{250, 69,40, 1},{ 80,251, 1,40}, // 248-251 + {140,252, 0,41}}; // 252, 253-255 are reserved + +#define nex(state,sel) State_table[state][sel] + +// The code used to generate the above table at run time (4% slower). +// To print the table, uncomment the 4 lines of print statements below. +// In this code x,y = n0,n1 is the number of 0,1 bits represented by a state. 
+#else + +class StateTable { + Array ns; // state*4 -> next state if 0, if 1, n0, n1 + enum {B=5, N=64}; // sizes of b, t + static const int b[B]; // x -> max y, y -> max x + static U8 t[N][N][2]; // x,y -> state number, number of states + int num_states(int x, int y); // compute t[x][y][1] + void discount(int& x); // set new value of x after 1 or y after 0 + void next_state(int& x, int& y, int b); // new (x,y) after bit b +public: + int operator()(int state, int sel) {return ns[state*4+sel];} + StateTable(); +} nex; + +const int StateTable::b[B]={42,41,13,6,5}; // x -> max y, y -> max x +U8 StateTable::t[N][N][2]; + +int StateTable::num_states(int x, int y) { + if (x=N || y>=N || y>=B || x>=b[y]) return 0; + + // States 0-30 are a history of the last 0-4 bits + if (x+y<=4) { // x+y choose x = (x+y)!/x!y! + int r=1; + for (int i=x+1; i<=x+y; ++i) r*=i; + for (int i=2; i<=y; ++i) r/=i; + return r; + } + + // States 31-255 represent a 0,1 count and possibly the last bit + // if the state is reachable by either a 0 or 1. 
+ else + return 1+(y>0 && x+y<16); +} + +// New value of count x if the opposite bit is observed +void StateTable::discount(int& x) { + if (x>2) x=ilog(x)/6-1; +} + +// compute next x,y (0 to N) given input b (0 or 1) +void StateTable::next_state(int& x, int& y, int b) { + if (x next if 0, next if 1, x, y +StateTable::StateTable(): ns(1024) { + + // Assign states + int state=0; + for (int i=0; i<256; ++i) { + for (int y=0; y<=i; ++y) { + int x=i-y; + int n=num_states(x, y); + if (n) { + t[x][y][0]=state; + t[x][y][1]=n; + state+=n; + } + } + } + + // Print/generate next state table + state=0; + for (int i=0; i0) ns1+=t[x-1][y+1][1]; + ns[state*4]=ns0; + ns[state*4+1]=ns1; + ns[state*4+2]=x; + ns[state*4+3]=y; + } + else if (t[x][y][1]) { + next_state(x0, y0, 0); + next_state(x1, y1, 1); + ns[state*4]=ns0=t[x0][y0][0]; + ns[state*4+1]=ns1=t[x1][y1][0]+(t[x1][y1][1]>1); + ns[state*4+2]=x; + ns[state*4+3]=y; + } + // uncomment to print table above +// printf("{%3d,%3d,%2d,%2d},", ns[state*4], ns[state*4+1], +// ns[state*4+2], ns[state*4+3]); +// if (state%4==3) printf(" // %d-%d\n ", state-3, state); + assert(state>=0 && state<256); + assert(t[x][y][1]>0); + assert(t[x][y][0]<=state); + assert(t[x][y][0]+t[x][y][1]>state); + assert(t[x][y][1]<=6); + assert(t[x0][y0][1]>0); + assert(t[x1][y1][1]>0); + assert(ns0-t[x0][y0][0]=0); + assert(ns1-t[x1][y1][0]=0); + ++state; + } + } + } +// printf("%d states\n", state); exit(0); // uncomment to print table above +} + +#endif + +///////////////////////////// Squash ////////////////////////////// + +// return p = 1/(1 + exp(-d)), d scaled by 8 bits, p scaled by 12 bits +int squash(int d) { + static const int t[33]={ + 1,2,3,6,10,16,27,45,73,120,194,310,488,747,1101, + 1546,2047,2549,2994,3348,3607,3785,3901,3975,4022, + 4050,4068,4079,4085,4089,4092,4093,4094}; + if (d>2047) return 4095; + if (d<-2047) return 0; + int w=d&127; + d=(d>>7)+16; + return (t[d]*(128-w)+t[(d+1)]*w+64) >> 7; +} + +//////////////////////////// Stretch 
/////////////////////////////// + +// Inverse of squash. d = ln(p/(1-p)), d scaled by 8 bits, p by 12 bits. +// d has range -2047 to 2047 representing -8 to 8. p has range 0 to 4095. + +class Stretch { + Array t; +public: + Stretch(); + int operator()(int p) const { + assert(p>=0 && p<4096); + return t[p]; + } +} stretch; + +Stretch::Stretch(): t(4096) { + int pi=0; + for (int x=-2047; x<=2047; ++x) { // invert squash() + int i=squash(x); + for (int j=pi; j<=i; ++j) + t[j]=x; + pi=i+1; + } + t[4095]=2047; +} + +//////////////////////////// Mixer ///////////////////////////// + +// Mixer m(N, M, S=1, w=0) combines models using M neural networks with +// N inputs each, of which up to S may be selected. If S > 1 then +// the outputs of these neural networks are combined using another +// neural network (with parameters S, 1, 1). If S = 1 then the +// output is direct. The weights are initially w (+-32K). +// It is used as follows: +// m.update() trains the network where the expected output is the +// last bit (in the global variable y). +// m.add(stretch(p)) inputs prediction from one of N models. The +// prediction should be positive to predict a 1 bit, negative for 0, +// nominally +-256 to +-2K. The maximum allowed value is +-32K but +// using such large values may cause overflow if N is large. +// m.set(cxt, range) selects cxt as one of 'range' neural networks to +// use. 0 <= cxt < range. Should be called up to S times such +// that the total of the ranges is <= M. +// m.p() returns the output prediction that the next bit is 1 as a +// 12 bit number (0 to 4095). + +// dot_product returns dot product t*w of n elements. n is rounded +// up to a multiple of 8. Result is scaled down by 8 bits. +#ifdef NOASM // no assembly language +int dot_product(short *t, short *w, int n) { + int sum=0; + n=(n+7)&-8; + for (int i=0; i> 8; + return sum; +} +#else // The NASM version uses MMX and is about 8 times faster. 
+extern "C" int dot_product(short *t, short *w, int n); // in NASM +#endif + +// Train neural network weights w[n] given inputs t[n] and err. +// w[i] += t[i]*err, i=0..n-1. t, w, err are signed 16 bits (+- 32K). +// err is scaled 16 bits (representing +- 1/2). w[i] is clamped to +- 32K +// and rounded. n is rounded up to a multiple of 8. +#ifdef NOASM +void train(short *t, short *w, int n, int err) { + n=(n+7)&-8; + for (int i=0; i>16)+1>>1); + if (wt<-32768) wt=-32768; + if (wt>32767) wt=32767; + w[i]=wt; + } +} +#else +extern "C" void train(short *t, short *w, int n, int err); // in NASM +#endif + +class Mixer { + const int N, M, S; // max inputs, max contexts, max context sets + Array tx; // N inputs from add() + Array wx; // N*M weights + Array cxt; // S contexts + int ncxt; // number of contexts (0 to S) + int base; // offset of next context + int nx; // Number of inputs in tx, 0 to N + Array pr; // last result (scaled 12 bits) + Mixer* mp; // points to a Mixer to combine results +public: + Mixer(int n, int m, int s=1, int w=0); + + // Adjust weights to minimize coding cost of last prediction + void update() { + for (int i=0; i=-32768 && err<32768); + train(&tx[0], &wx[cxt[i]*N], nx, err); + } + nx=base=ncxt=0; + } + + // Input x (call up to N times) + void add(int x) { + assert(nx=0); + assert(ncxt=0); + assert(base+cxupdate(); + for (int i=0; i>5); + mp->add(stretch(pr[i])); + } + mp->set(0, 1); + return mp->p(); + } + else { // S=1 context + return pr[0]=squash(dot_product(&tx[0], &wx[0], nx)>>8); + } + } + ~Mixer(); +}; + +Mixer::~Mixer() { + delete mp; +} + + +Mixer::Mixer(int n, int m, int s, int w): + N((n+7)&-8), M(m), S(s), tx(N), wx(N*M), + cxt(S), ncxt(0), base(0), nx(0), pr(S), mp(0) { + assert(n>0 && N>0 && (N&7)==0 && M>0); + for (int i=0; i1) mp=new Mixer(S, 1, 1, 0x7fff); +} + +//////////////////////////// APM ////////////////////////////// + +// APM maps a probability and a context into a new probability +// that bit y will next be 1. 
After each guess it updates +// its state to improve future guesses. Methods: +// +// APM a(N) creates with N contexts, uses 66*N bytes memory. +// a.p(pr, cx, rate=7) returned adjusted probability in context cx (0 to +// N-1). rate determines the learning rate (smaller = faster, default 7). +// Probabilities are scaled 12 bits (0-4095). + +class APM { + int index; // last p, context + const int N; // number of contexts + Array t; // [N][33]: p, context -> p +public: + APM(int n); + int p(int pr=2048, int cxt=0, int rate=7) { + assert(pr>=0 && pr<4096 && cxt>=0 && cxt0 && rate<32); + pr=stretch(pr); + int g=(y<<16)+(y<> rate; + t[index+1] += g-t[index+1] >> rate; + const int w=pr&127; // interpolation weight (33 points) + index=(pr+2048>>7)+cxt*33; + return t[index]*(128-w)+t[index+1]*w >> 11; + } +}; + +// maps p, cxt -> p initially +APM::APM(int n): index(0), N(n), t(n*33) { + for (int i=0; i probability * 4096 +class StateMap { +protected: + int cxt; // context + Array t; // 256 states -> probability * 64K +public: + StateMap(); + int p(int cx) { + assert(cx>=0 && cx> 8; + return t[cxt=cx] >> 4; + } +}; + +StateMap::StateMap(): cxt(0), t(256) { + for (int i=0; i<256; ++i) { + int n0=nex(i,2); + int n1=nex(i,3); + if (n0==0) n1*=64; + if (n1==0) n0*=64; + t[i] = 65536*(n1+1)/(n0+n1+2); + } +} + +//////////////////////////// hash ////////////////////////////// + +// Hash 2-5 ints. +inline U32 hash(U32 a, U32 b, U32 c=0xffffffff, U32 d=0xffffffff, + U32 e=0xffffffff) { + U32 h=a*200002979u+b*30005491u+c*50004239u+d*70004807u+e*110002499u; + return h^h>>9^a>>2^b>>3^c>>4^d>>5^e>>6; +} + +///////////////////////////// BH //////////////////////////////// + +// A BH maps a 32 bit hash to an array of B bytes (checksum and B-2 values) +// +// BH bh(N); creates N element table with B bytes each. +// N must be a power of 2. The first byte of each element is +// reserved for a checksum to detect collisions. 
The remaining +// B-1 bytes are values, prioritized by the first value. This +// byte is 0 to mark an unused element. +// +// bh[i] returns a pointer to the i'th element, such that +// bh[i][0] is a checksum of i, bh[i][1] is the priority, and +// bh[i][2..B-1] are other values (0-255). +// The low lg(n) bits as an index into the table. +// If a collision is detected, up to M nearby locations in the same +// cache line are tested and the first matching checksum or +// empty element is returned. +// If no match or empty element is found, then the lowest priority +// element is replaced. + +// 2 byte checksum with LRU replacement (except last 2 by priority) +template class BH { + enum {M=8}; // search limit + Array t; // elements + U32 n; // size-1 +public: + BH(int i): t(i*B), n(i-1) { + assert(B>=2 && i>0 && (i&(i-1))==0); // size a power of 2? + } + U8* operator[](U32 i); +}; + +template +inline U8* BH::operator[](U32 i) { + int chk=(i>>16^i)&0xffff; + i=i*M&n; + U8 *p; + U16 *cp; + int j; + for (j=0; j2 && t[(i+j)*B+2]>t[(i+j-1)*B+2]) --j; + } + else memcpy(tmp, cp, B); + memmove(&t[(i+1)*B], &t[i*B], j*B); + memcpy(&t[i*B], tmp, B); + return &t[i*B+1]; +} + +/////////////////////////// ContextMap ///////////////////////// +// +// A ContextMap maps contexts to a bit histories and makes predictions +// to a Mixer. Methods common to all classes: +// +// ContextMap cm(M, C); creates using about M bytes of memory (a power +// of 2) for C contexts. +// cm.set(cx); sets the next context to cx, called up to C times +// cx is an arbitrary 32 bit value that identifies the context. +// It should be called before predicting the first bit of each byte. +// cm.mix(m) updates Mixer m with the next prediction. Returns 1 +// if context cx is found, else 0. Then it extends all the contexts with +// global bit y. It should be called for every bit: +// +// if (bpos==0) +// for (int i=0; i= 1. Context need not be hashed. 
+ +// Predict to mixer m from bit history state s, using sm to map s to +// a probability. +inline int mix2(Mixer& m, int s, StateMap& sm) { + int p1=sm.p(s); + int n0=-!nex(s,2); + int n1=-!nex(s,3); + int st=stretch(p1)>>2; + m.add(st); + p1>>=4; + int p0=255-p1; + m.add(p1-p0); + m.add(st*(n1-n0)); + m.add((p1&n0)-(p0&n1)); + m.add((p1&n1)-(p0&n0)); + return s>0; +} + +// A RunContextMap maps a context into the next byte and a repeat +// count up to M. Size should be a power of 2. Memory usage is 3M/4. +class RunContextMap { + BH<4> t; + U8* cp; +public: + RunContextMap(int m): t(m/4) {cp=t[0]+1;} + void set(U32 cx) { // update count + if (cp[0]==0 || cp[1]!=buf(1)) cp[0]=1, cp[1]=buf(1); + else if (cp[0]<255) ++cp[0]; + cp=t[cx]+1; + } + int p() { // predict next bit + if (cp[1]+256>>8-bpos==c0) + return ((cp[1]>>7-bpos&1)*2-1)*ilog(cp[0]+1)*8; + else + return 0; + } + int mix(Mixer& m) { // return run length + m.add(p()); + return cp[0]!=0; + } +}; + +// Context is looked up directly. m=size is power of 2 in bytes. +// Context should be < m/512. High bits are discarded. +class SmallStationaryContextMap { + Array t; + int cxt; + U16 *cp; +public: + SmallStationaryContextMap(int m): t(m/2), cxt(0) { + assert((m/2&m/2-1)==0); // power of 2? + for (int i=0; i> rate; + cp=&t[cxt+c0]; + m.add(stretch(*cp>>4)); + } +}; + +// Context map for large contexts. Most modeling uses this type of context +// map. It includes a built in RunContextMap to predict the last byte seen +// in the same context, and also bit-level contexts that map to a bit +// history state. +// +// Bit histories are stored in a hash table. The table is organized into +// 64-byte buckets alinged on cache page boundaries. Each bucket contains +// a hash chain of 7 elements, plus a 2 element queue (packed into 1 byte) +// of the last 2 elements accessed for LRU replacement. 
Each element has +// a 2 byte checksum for detecting collisions, and an array of 7 bit history +// states indexed by the last 0 to 2 bits of context. The buckets are indexed +// by a context ending after 0, 2, or 5 bits of the current byte. Thus, each +// byte modeled results in 3 main memory accesses per context, with all other +// accesses to cache. +// +// On bits 0, 2 and 5, the context is updated and a new bucket is selected. +// The most recently accessed element is tried first, by comparing the +// 16 bit checksum, then the 7 elements are searched linearly. If no match +// is found, then the element with the lowest priority among the 5 elements +// not in the LRU queue is replaced. After a replacement, the queue is +// emptied (so that consecutive misses favor a LFU replacement policy). +// In all cases, the found/replaced element is put in the front of the queue. +// +// The priority is the state number of the first element (the one with 0 +// additional bits of context). The states are sorted by increasing n0+n1 +// (number of bits seen), implementing a LFU replacement policy. +// +// When the context ends on a byte boundary (bit 0), only 3 of the 7 bit +// history states are used. The remaining 4 bytes implement a run model +// as follows: where is the last byte +// seen, possibly repeated. is a 7 bit count and a 1 bit +// flag (represented by count * 2 + d). If d=0 then = 1..127 is the +// number of repeats of and no other bytes have been seen. If d is 1 then +// other byte values have been seen in this context prior to the last +// copies of . +// +// As an optimization, the last two hash elements of each byte (representing +// contexts with 2-7 bits) are not updated until a context is seen for +// a second time. This is indicated by = <1,0> (2). After update, +// is updated to <2,0> or <1,1> (4 or 3). 
+ +class ContextMap { + const int C; // max number of contexts + class E { // hash element, 64 bytes + U16 chk[7]; // byte context checksums + U8 last; // last 2 accesses (0-6) in low, high nibble + public: + U8 bh[7][7]; // byte context, 3-bit context -> bit history state + // bh[][0] = 1st bit, bh[][1,2] = 2nd bit, bh[][3..6] = 3rd bit + // bh[][0] is also a replacement priority, 0 = empty + U8* get(U16 chk); // Find element (0-6) matching checksum. + // If not found, insert or replace lowest priority (not last). + }; + Array t; // bit histories for bits 0-1, 2-4, 5-7 + // For 0-1, also contains a run count in bh[][4] and value in bh[][5] + // and pending update count in bh[7] + Array cp; // C pointers to current bit history + Array cp0; // First element of 7 element array containing cp[i] + Array cxt; // C whole byte contexts (hashes) + Array runp; // C [0..3] = count, value, unused, unused + StateMap *sm; // C maps of state -> p + int cn; // Next context to set by set() + void update(U32 cx, int c); // train model that context cx predicts c + int mix1(Mixer& m, int cc, int bp, int c1, int y1); + // mix() with global context passed as arguments to improve speed. +public: + ContextMap(int m, int c=1); // m = memory in bytes, a power of 2, C = c + ~ContextMap(); + void set(U32 cx, int next=-1); // set next whole byte context to cx + // if next is 0 then set order does not matter + int mix(Mixer& m) {return mix1(m, c0, bpos, buf(1), y);} +}; + +// Find or create hash element matching checksum ch +inline U8* ContextMap::E::get(U16 ch) { + if (chk[last&15]==ch) return &bh[last&15][0]; + int b=0xffff, bi=0; + for (int i=0; i<7; ++i) { + if (chk[i]==ch) return last=last<<4|i, &bh[i][0]; + int pri=bh[i][0]; + if ((last&15)!=i && last>>4!=i && pri>6), cp(c), cp0(c), + cxt(c), runp(c), cn(0) { + assert(m>=64 && (m&m-1)==0); // power of 2? 
+ assert(sizeof(E)==64); + sm=new StateMap[C]; + for (int i=0; i=0 && i>16; + cxt[i]=cx*123456791+i; +} + +// Update the model with bit y1, and predict next bit to mixer m. +// Context: cc=c0, bp=bpos, c1=buf(1), y1=y. +int ContextMap::mix1(Mixer& m, int cc, int bp, int c1, int y1) { + + // Update model with y + int result=0; + for (int i=0; i=&t[0].bh[0][0] && cp[i]<=&t[t.size()-1].bh[6][6]); + assert((long(cp[i])&63)>=15); + int ns=nex(*cp[i], y1); + if (ns>=204 && rnd() << (452-ns>>3)) ns-=4; // probabilistic increment + *cp[i]=ns; + } + + // Update context pointers + if (bpos>1 && runp[i][0]==0) + cp[i]=0; + else if (bpos==1||bpos==3||bpos==6) + cp[i]=cp0[i]+1+(cc&1); + else if (bpos==4||bpos==7) + cp[i]=cp0[i]+3+(cc&3); + else { + cp0[i]=cp[i]=t[cxt[i]+cc&t.size()-1].get(cxt[i]>>16); + + // Update pending bit histories for bits 2-7 + if (bpos==0) { + if (cp0[i][3]==2) { + const int c=cp0[i][4]+256; + U8 *p=t[cxt[i]+(c>>6)&t.size()-1].get(cxt[i]>>16); + p[0]=1+((c>>5)&1); + p[1+((c>>5)&1)]=1+((c>>4)&1); + p[3+((c>>4)&3)]=1+((c>>3)&1); + p=t[cxt[i]+(c>>3)&t.size()-1].get(cxt[i]>>16); + p[0]=1+((c>>2)&1); + p[1+((c>>2)&1)]=1+((c>>1)&1); + p[3+((c>>1)&3)]=1+(c&1); + cp0[i][6]=0; + } + // Update run count of previous context + if (runp[i][0]==0) // new context + runp[i][0]=2, runp[i][1]=c1; + else if (runp[i][1]!=c1) // different byte in context + runp[i][0]=1, runp[i][1]=c1; + else if (runp[i][0]<254) // same byte in context + runp[i][0]+=2; + else if (runp[i][0]==255) + runp[i][0]=128; + runp[i]=cp0[i]+3; + } + } + + // predict from last byte in context + int rc=runp[i][0]; // count*2, +1 if 2 different bytes seen + if (runp[i][1]+256>>8-bp==cc) { + int b=(runp[i][1]>>7-bp&1)*2-1; // predicted bit + for 1, - for 0 + int c=ilog(rc+1)<<2+(~rc&1); + m.add(b*c); + } + else + m.add(0); + + // predict from bit context + result+=mix2(m, cp[i] ? 
*cp[i] : 0, sm[i]); + } + if (bp==7) cn=0; + return result; +} + +//////////////////////////// Models ////////////////////////////// + +// All of the models below take a Mixer as a parameter and write +// predictions to it. + +//////////////////////////// matchModel /////////////////////////// + +// matchModel() finds the longest matching context and returns its length + +int matchModel(Mixer& m) { + const int MAXLEN=65534; // longest allowed match + 1 + static Array t(MEM); // hash table of pointers to contexts + static int h=0; // hash of last 7 bytes + static int ptr=0; // points to next byte of match if any + static int len=0; // length of match, or 0 if no match + static int result=0; + + static SmallStationaryContextMap scm1(0x20000); + + if (!bpos) { + h=h*997*8+buf(1)+1&t.size()-1; // update context hash + if (len) ++len, ++ptr; + else { // find match + ptr=t[h]; + if (ptr && pos-ptr0 && !(result&0xfff)) printf("pos=%d len=%d ptr=%d\n", pos, len, ptr); + scm1.set(pos); + } + + // predict + if (len>MAXLEN) len=MAXLEN; + int sgn; + if (len && buf(1)==buf[ptr-1] && c0==buf[ptr]+256>>8-bpos) { + if (buf[ptr]>>7-bpos&1) sgn=1; + else sgn=-1; + } + else sgn=len=0; + m.add(sgn*4*ilog(len)); + m.add(sgn*64*min(len, 32)); + scm1.mix(m); + return result; +} + +//////////////////////////// picModel ////////////////////////// + +// Model a 1728 by 2376 2-color CCITT bitmap image, left to right scan, +// MSB first (216 bytes per row, 513216 bytes total). Insert predictions +// into m. 
+ +void picModel(Mixer& m) { + static U32 r0, r1, r2, r3; // last 4 rows, bit 8 is over current pixel + static Array t(0x10200); // model: cxt -> state + const int N=3; // number of contexts + static int cxt[N]; // contexts + static StateMap sm[N]; + + // update the model + for (int i=0; i>(7-bpos))&1); + r2+=r2+((buf(431)>>(7-bpos))&1); + r3+=r3+((buf(647)>>(7-bpos))&1); + cxt[0]=r0&0x7|r1>>4&0x38|r2>>3&0xc0; + cxt[1]=0x100+(r0&1|r1>>4&0x3e|r2>>2&0x40|r3>>1&0x80); + cxt[2]=0x200+(r0&0x3f^r1&0x3ffe^r2<<2&0x7f00^r3<<5&0xf800); + + // predict + for (int i=0; i='A' && c<='Z') + c+='a'-'A'; + if (c>='a' && c<='z' || c>=128) { + word0=word0*263*32+c; + text0=text0*997*16+c; + } + else if (word0) { + word5=word4*23; + word4=word3*19; + word3=word2*17; + word2=word1*13; + word1=word0*11; + word0=0; + } + if (c==10) nl1=nl, nl=pos-1; + int col=min(255, pos-nl), above=buf[nl1+col]; // text column context + U32 h=word0*271+buf(1); + + cm.set(h); + cm.set(word0); + cm.set(h+word1); + cm.set(word0+word1*31); + cm.set(h+word1+word2*29); + cm.set(text0&0xffffff); + cm.set(text0&0xfffff); + + cm.set(h+word2); + cm.set(h+word3); + cm.set(h+word4); + cm.set(h+word5); + cm.set(buf(1)|buf(3)<<8|buf(5)<<16); + cm.set(buf(2)|buf(4)<<8|buf(6)<<16); + + cm.set(h+word1+word3); + cm.set(h+word2+word3); + + // Text column models + cm.set(col<<16|buf(1)<<8|above); + cm.set(buf(1)<<8|above); + cm.set(col<<8|buf(1)); + cm.set(col); + } + cm.mix(m); +} + +//////////////////////////// recordModel /////////////////////// + +// Model 2-D data with fixed record length. Also order 1-2 models +// that include the distance to the last match. 
+ +void recordModel(Mixer& m) { + static int cpos1[256] , cpos2[256], cpos3[256], cpos4[256]; + static int wpos1[0x10000]; // buf(1..2) -> last position + static int rlen=2, rlen1=3, rlen2=4; // run length and 2 candidates + static int rcount1=0, rcount2=0; // candidate counts + static ContextMap cm(32768, 3), cn(32768/2, 3), co(32768*2, 3), cp(MEM, 3); + + // Find record length + if (!bpos) { + int w=c4&0xffff, c=w&255, d=w>>8; +#if 1 + int r=pos-cpos1[c]; + if (r>1 && r==cpos1[c]-cpos2[c] + && r==cpos2[c]-cpos3[c] && r==cpos3[c]-cpos4[c] + && (r>15 || (c==buf(r*5+1)) && c==buf(r*6+1))) { + if (r==rlen1) ++rcount1; + else if (r==rlen2) ++rcount2; + else if (rcount1>rcount2) rlen2=r, rcount2=1; + else rlen1=r, rcount1=1; + } + if (rcount1>15 && rlen!=rlen1) rlen=rlen1, rcount1=rcount2=0; + if (rcount2>15 && rlen!=rlen2) rlen=rlen2, rcount1=rcount2=0; + + // Set 2 dimensional contexts + assert(rlen>0); +#endif + cm.set(c<<8| (min(255, pos-cpos1[c])/4) ); + cm.set(w<<9| llog(pos-wpos1[w])>>2); + + cm.set(rlen|buf(rlen)<<10|buf(rlen*2)<<18); + cn.set(w|rlen<<8); + cn.set(d|rlen<<16); + cn.set(c|rlen<<8); + + co.set(buf(1)<<8|min(255, pos-cpos1[buf(1)])); + co.set(buf(1)<<17|buf(2)<<9|llog(pos-wpos1[w])>>2); + int col=pos%rlen; + co.set(buf(1)<<8|buf(rlen)); + + //cp.set(w*16); + //cp.set(d*32); + //cp.set(c*64); + cp.set(rlen|buf(rlen)<<10|col<<18); + cp.set(rlen|buf(1)<<10|col<<18); + cp.set(col|rlen<<12); + + // update last context positions + cpos4[c]=cpos3[c]; + cpos3[c]=cpos2[c]; + cpos2[c]=cpos1[c]; + cpos1[c]=pos; + wpos1[w]=pos; + } + cm.mix(m); + cn.mix(m); + co.mix(m); + cp.mix(m); +} + + +//////////////////////////// sparseModel /////////////////////// + +// Model order 1-2 contexts with gaps. 
+ +void sparseModel(Mixer& m, int seenbefore, int howmany) { + static ContextMap cm(MEM*2, 48); + static int mask = 0; + + if (bpos==0) { + + cm.set( c4&0x00f0f0f0); + cm.set((c4&0xf0f0f0f0)+1); + cm.set((c4&0x00f8f8f8)+2); + cm.set((c4&0xf8f8f8f8)+3); + cm.set((c4&0x00e0e0e0)+4); + cm.set((c4&0xe0e0e0e0)+5); + cm.set((c4&0x00f0f0ff)+6); + + cm.set(seenbefore); + cm.set(howmany); + cm.set(c4&0x00ff00ff); + cm.set(c4&0xff0000ff); + cm.set(buf(1)|buf(5)<<8); + cm.set(buf(1)|buf(6)<<8); + cm.set(buf(3)|buf(6)<<8); + cm.set(buf(4)|buf(8)<<8); + + for (int i=1; i<8; ++i) { + cm.set((buf(i+1)<<8)|buf(i+2)); + cm.set((buf(i+1)<<8)|buf(i+3)); + cm.set(seenbefore|buf(i)<<8); + } + + int fl = 0; + if( c4&0xff != 0 ){ + if( isalpha( c4&0xff ) ) fl = 1; + else if( ispunct( c4&0xff ) ) fl = 2; + else if( isspace( c4&0xff ) ) fl = 3; + else if( c4&0xff == 0xff ) fl = 4; + else if( c4&0xff < 16 ) fl = 5; + else if( c4&0xff < 64 ) fl = 6; + else fl = 7; + } + mask = (mask<<3)|fl; + cm.set(mask); + cm.set(mask<<8|buf(1)); + cm.set(mask<<17|buf(2)<<8|buf(3)); + cm.set(mask&0x1ff|((c4&0xf0f0f0f0)<<9)); + } + cm.mix(m); +} + +//////////////////////////// distanceModel /////////////////////// + +// Model for modelling distances between symbols + +void distanceModel(Mixer& m) { + static ContextMap cr(MEM, 3); + if( bpos == 0 ){ + static int pos00=0,pos20=0,posnl=0; + int c=c4&0xff; + if(c==0x00)pos00=pos; + if(c==0x20)pos20=pos; + if(c==0xff||c=='\r'||c=='\n')posnl=pos; + cr.set(min(pos-pos00,255)|(c<<8)); + cr.set(min(pos-pos20,255)|(c<<8)); + cr.set(min(pos-posnl,255)|(c<<8)+234567); + } + cr.mix(m); +} + +//////////////////////////// bmpModel ///////////////////////////////// + +// Model a 24-bit color uncompressed .bmp or .tif file. Return +// width in pixels if an image file is detected, else 0. 
+ +// 32-bit little endian number at buf(i)..buf(i-3) +inline U32 i4(int i) { + assert(i>3); + return buf(i)+256*buf(i-1)+65536*buf(i-2)+16777216*buf(i-3); +} + +// 16-bit +inline int i2(int i) { + assert(i>1); + return buf(i)+256*buf(i-1); +} + +// Square buf(i) +inline int sqrbuf(int i) { + assert(i>0); + return buf(i)*buf(i); +} + +int bmpModel(Mixer& m) { + static int w=0; // width of image in bytes (pixels * 3) + static int eoi=0; // end of image + static U32 tiff=0; // offset of tif header + const int SC=0x20000; + static SmallStationaryContextMap scm1(SC), scm2(SC), + scm3(SC), scm4(SC), scm5(SC), scm6(SC*2); + static ContextMap cm(MEM*4, 8); + + // Detect .bmp file header (24 bit color, not compressed) + if (!bpos && buf(54)=='B' && buf(53)=='M' + && i4(44)==54 && i4(40)==40 && i4(24)==0) { + w=(i4(36)+3&-4)*3; // image width + const int height=i4(32); + eoi=pos; + if (w<0x30000 && height<0x10000) { + eoi=pos+w*height; // image size in bytes + printf("BMP %dx%d ", w/3, height); + } + else + eoi=pos; + } + + // Detect .tif file header (24 bit color, not compressed). + // Parsing is crude, won't work with weird formats. 
+ if (!bpos) { + if (c4==0x49492a00) tiff=pos; // Intel format only + if (pos-tiff==4 && c4!=0x08000000) tiff=0; // 8=normal offset to directory + if (tiff && pos-tiff==200) { // most of directory should be read by now + int dirsize=i2(pos-tiff-4); // number of 12-byte directory entries + w=0; + int bpp=0, compression=0, width=0, height=0; + for (int i=tiff+6; i0; i+=12) { + int tag=i2(pos-i); // 256=width, 257==height, 259: 1=no compression + // 277=3 samples/pixel + int tagfmt=i2(pos-i-2); // 3=short, 4=long + int taglen=i4(pos-i-4); // number of elements in tagval + int tagval=i4(pos-i-8); // 1 long, 1-2 short, or points to array + if ((tagfmt==3||tagfmt==4) && taglen==1) { + if (tag==256) width=tagval; + if (tag==257) height=tagval; + if (tag==259) compression=tagval; // 1 = no compression + if (tag==277) bpp=tagval; // should be 3 + } + } + if (width>0 && height>0 && width*height>50 && compression==1 + && (bpp==1||bpp==3)) + eoi=tiff+width*height*bpp, w=width*bpp; + if (eoi>pos) + printf("TIFF %dx%dx%d ", width, height, bpp); + else + tiff=w=0; + } + } + if (pos>eoi) return w=0; + + // Select nearby pixels as context + if (!bpos) { + assert(w>3); + int color=pos%3; + int mean=buf(3)+buf(w-3)+buf(w)+buf(w+3); + const int var=sqrbuf(3)+sqrbuf(w-3)+sqrbuf(w)+sqrbuf(w+3)-mean*mean/4>>2; + mean>>=2; + const int logvar=ilog(var); + int i=0; + cm.set(hash(++i, buf(3)>>2, buf(w)>>2, color)); + cm.set(hash(++i, buf(3)>>2, buf(1)>>2, color)); + cm.set(hash(++i, buf(3)>>2, buf(2)>>2, color)); + cm.set(hash(++i, buf(w)>>2, buf(1)>>2, color)); + cm.set(hash(++i, buf(w)>>2, buf(2)>>2, color)); + cm.set(hash(++i, buf(3)+buf(w)>>1, color)); + cm.set(hash(++i, buf(3)+buf(w)>>3, buf(1)>>5, buf(2)>>5, color)); + cm.set(hash(++i, mean, logvar>>5, color)); + scm1.set(buf(3)+buf(w)>>1); + scm2.set(buf(3)+buf(w)-buf(w+3)>>1); + scm3.set(buf(3)*2-buf(6)>>1); + scm4.set(buf(w)*2-buf(w*2)>>1); + scm5.set(buf(3)+buf(w)-buf(w-3)>>1); + scm6.set(mean>>1|logvar<<1&0x180); + } + + // 
Predict next bit + scm1.mix(m); + scm2.mix(m); + scm3.mix(m); + scm4.mix(m); + scm5.mix(m); + scm6.mix(m); + cm.mix(m); + return w; +} + +//////////////////////////// jpegModel ///////////////////////// + +// Model JPEG. Return 1 if a JPEG file is detected or else 0. +// Only the baseline and 8 bit extended Huffman coded DCT modes are +// supported. The model partially decodes the JPEG image to provide +// context for the Huffman coded symbols. + +// Print a JPEG segment at buf[p...] for debugging +void dump(const char* msg, int p) { + printf("%s:", msg); + int len=buf[p+2]*256+buf[p+3]; + for (int i=0; i ht(8); // pointers to Huffman table headers + static int htsize=0; // number of pointers in ht + + // Huffman decode state + static U32 huffcode=0; // Current Huffman code including extra bits + static int huffbits=0; // Number of valid bits in huffcode + static int huffsize=0; // Number of bits without extra bits + static int rs=-1; // Decoded huffcode without extra bits. It represents + // 2 packed 4-bit numbers, r=run of zeros, s=number of extra bits for + // first nonzero code. huffcode is complete when rs >= 0. + // rs is -1 prior to decoding incomplete huffcode. + static int mcupos=0; // position in MCU (0-639). The low 6 bits mark + // the coefficient in zigzag scan order (0=DC, 1-63=AC). The high + // bits mark the block within the MCU, used to select Huffman tables. 
+ + // Decoding tables + static Array huf(128); // Tc*64+Th*16+m -> min, max, val + static int mcusize=0; // number of coefficients in an MCU + static int linesize=0; // width of image in MCU + static int hufsel[2][10]; // DC/AC, mcupos/64 -> huf decode table + static Array hbuf(2048); // Tc*1024+Th*256+hufcode -> RS + + // Image state + static Array color(10); // block -> component (0-3) + static Array pred(4); // component -> last DC value + static int dc=0; // DC value of the current block + static int width=0; // Image width in MCU + static int row=0, column=0; // in MCU (column 0 to width-1) + static Buf cbuf(0x20000); // Rotating buffer of coefficients, coded as: + // DC: level shifted absolute value, low 4 bits discarded, i.e. + // [-1023...1024] -> [0...255]. + // AC: as an RS code: a run of R (0-15) zeros followed by an S (0-15) + // bit number, or 00 for end of block (in zigzag order). + // However if R=0, then the format is ssss11xx where ssss is S, + // xx is the first 2 extra bits, and the last 2 bits are 1 (since + // this never occurs in a valid RS code). + static int cpos=0; // position in cbuf + static U32 huff1=0, huff2=0, huff3=0, huff4=0; // hashes of last codes + static int rs1, rs2, rs3, rs4; // last 4 RS codes + static int ssum=0, ssum1=0, ssum2=0, ssum3=0, ssum4=0; + // sum of S in RS codes in block and last 4 values + + // Be sure to quit on a byte boundary + if (!bpos) next_jpeg=jpeg>1; + if (bpos && !jpeg) return next_jpeg; + if (!bpos && app>0) --app; + if (app>0) return next_jpeg; + if (!bpos) { + + // Parse. Baseline DCT-Huffman JPEG syntax is: + // SOI APPx... misc... SOF0 DHT... SOS data EOI + // SOI (= FF D8) start of image. + // APPx (= FF Ex) len ... where len is always a 2 byte big-endian length + // including the length itself but not the 2 byte preceding code. + // Application data is ignored. There may be more than one APPx. + // misc codes are DQT, DNL, DRI, COM (ignored). 
+ // SOF0 (= FF C0) len 08 height width Nf [C HV Tq]... + // where len, height, width (in pixels) are 2 bytes, Nf is the repeat + // count (1 byte) of [C HV Tq], where C is a component identifier + // (color, 0-3), HV is the horizontal and vertical dimensions + // of the MCU (high, low bits, packed), and Tq is the quantization + // table ID (not used). An MCU (minimum compression unit) consists + // of 64*H*V DCT coefficients for each color. + // DHT (= FF C4) len [TcTh L1...L16 V1,1..V1,L1 ... V16,1..V16,L16]... + // defines Huffman table Th (1-4) for Tc (0=DC (first coefficient) + // 1=AC (next 63 coefficients)). L1..L16 are the number of codes + // of length 1-16 (in ascending order) and Vx,y are the 8-bit values. + // A V code of RS means a run of R (0-15) zeros followed by S (0-15) + // additional bits to specify the next nonzero value, negative if + // the first additional bit is 0 (e.g. code x63 followed by the + // 3 bits 1,0,1 specify 7 coefficients: 0, 0, 0, 0, 0, 0, 5. + // Code 00 means end of block (remainder of 63 AC coefficients is 0). + // SOS (= FF DA) len Ns [Cs TdTa]... 0 3F 00 + // Start of scan. TdTa specifies DC/AC Huffman tables (0-3, packed + // into one byte) for component Cs matching C in SOF0, repeated + // Ns (1-4) times. + // EOI (= FF D9) is end of image. + // Huffman coded data is between SOI and EOI. Codes may be embedded: + // RST0-RST7 (= FF D0 to FF D7) mark the start of an independently + // compressed region. + // DNL (= FF DC) 04 00 height + // might appear at the end of the scan (ignored). + // FF 00 is interpreted as FF (to distinguish from RSTx, DNL, EOI). 
+ + // Detect JPEG (SOI, APPx) + if (!jpeg && buf(4)==FF && buf(3)==SOI && buf(2)==FF && buf(1)>>4==0xe) { + jpeg=1; + app=sos=sof=htsize=data=mcusize=linesize=0; + huffcode=huffbits=huffsize=mcupos=cpos=0, rs=-1; + memset(&huf[0], 0, huf.size()*sizeof(HUF)); + memset(&pred[0], 0, pred.size()*sizeof(int)); + } + + // Detect end of JPEG when data contains a marker other than RSTx + // or byte stuff (00). + if (jpeg && data && buf(2)==FF && buf(1) && (buf(1)&0xf8)!=RST0) { + jassert(buf(1)==EOI); + jpeg=0; + } + if (!jpeg) return next_jpeg; + + // Detect APPx or COM field + if (!data && !app && buf(4)==FF && (buf(3)>>4==0xe || buf(3)==COM)) + app=buf(2)*256+buf(1)+2; + + // Save pointers to sof, ht, sos, data, + if (buf(5)==FF && buf(4)==SOS) { + int len=buf(3)*256+buf(2); + if (len==6+2*buf(1) && buf(1) && buf(1)<=4) // buf(1) is Ns + sos=pos-5, data=sos+len+2, jpeg=2; + } + if (buf(4)==FF && buf(3)==DHT && htsize<8) ht[htsize++]=pos-4; + if (buf(4)==FF && buf(3)==SOF0) sof=pos-4; + + // Restart + if (buf(2)==FF && (buf(1)&0xf8)==RST0) { + huffcode=huffbits=huffsize=mcupos=0, rs=-1; + memset(&pred[0], 0, pred.size()*sizeof(int)); + } + } + + { + // Build Huffman tables + // huf[Tc][Th][m] = min, max+1 codes of length m, pointer to byte values + if (pos==data && bpos==1) { + jassert(htsize>0); + for (int i=0; i>4, th=buf[p]&15; + if (tc>=2 || th>=4) break; + jassert(tc>=0 && tc<2 && th>=0 && th<4); + HUF* h=&huf[tc*64+th*16]; // [tc][th][0]; + int val=p+17; // pointer to values + int hval=tc*1024+th*256; // pointer to RS values in hbuf + for (int j=0; j<256; ++j) // copy RS codes + hbuf[hval+j]=buf[val+j]; + int code=0; + for (int j=0; j<16; ++j) { + h[j].min=code; + h[j].max=code+=buf[p+j+1]; + h[j].val=hval; + val+=buf[p+j+1]; + hval+=buf[p+j+1]; + code*=2; + } + p=val; + jassert(hval>=0 && hval<2048); + } + jassert(p==end); + } + huffcode=huffbits=huffsize=0, rs=-1; + + // Build Huffman table selection table (indexed by mcupos). + // Get image width. 
+ if (!sof && sos) return next_jpeg; + int ns=buf[sos+4]; + int nf=buf[sof+9]; + jassert(ns<=4 && nf<=4); + mcusize=0; // blocks per MCU + int hmax=0; // MCU horizontal dimension + for (int i=0; i>4>hmax) hmax=hv>>4; + hv=(hv&15)*(hv>>4); // number of blocks in component C + jassert(hv>=1 && hv+mcusize<=10); + while (hv) { + jassert(mcusize<10); + hufsel[0][mcusize]=buf[sos+2*i+6]>>4&15; + hufsel[1][mcusize]=buf[sos+2*i+6]&15; + jassert (hufsel[0][mcusize]<4 && hufsel[1][mcusize]<4); + color[mcusize]=i; + --hv; + ++mcusize; + } + } + } + } + jassert(hmax>=1 && hmax<=10); + width=buf[sof+7]*256+buf[sof+8]; // in pixels + int height=buf[sof+5]*256+buf[sof+6]; + printf("JPEG %dx%d ", width, height); + width=(width-1)/(hmax*8)+1; // in MCU + jassert(width>0); + mcusize*=64; // coefficients per MCU + row=column=0; + } + } + + + // Decode Huffman + { + if (mcusize && buf(1+(!bpos))!=FF) { // skip stuffed byte + jassert(huffbits<=32); + huffcode+=huffcode+y; + ++huffbits; + if (rs<0) { + jassert(huffbits>=1 && huffbits<=16); + const int ac=(mcupos&63)>0; + jassert(mcupos>=0 && (mcupos>>6)<10); + jassert(ac==0 || ac==1); + const int sel=hufsel[ac][mcupos>>6]; + jassert(sel>=0 && sel<4); + const int i=huffbits-1; + jassert(i>=0 && i<16); + const HUF *h=&huf[ac*64+sel*16]; // [ac][sel]; + jassert(h[i].min<=h[i].max && h[i].val<2048 && huffbits>0); + if (huffcode=h[i].min); + int k=h[i].val+huffcode-h[i].min; + jassert(k>=0 && k<2048); + rs=hbuf[k]; + huffsize=huffbits; + } + } + if (rs>=0) { + if (huffsize+(rs&15)==huffbits) { // done decoding + huff4=huff3; + huff3=huff2; + huff2=huff1; + huff1=hash(huffcode, huffbits); + rs4=rs3; + rs3=rs2; + rs2=rs1; + rs1=rs; + int x=0; // decoded extra bits + if (mcupos&63) { // AC + if (rs==0) { // EOB + mcupos=mcupos+63&-64; + jassert(mcupos>=0 && mcupos<=mcusize && mcupos<=640); + while (cpos&63) cbuf[cpos++]=0; + } + else { // rs = r zeros + s extra bits for the next nonzero value + // If first extra bit is 0 then value is negative. 
+ jassert((rs&15)<=10); + const int r=rs>>4; + const int s=rs&15; + jassert(mcupos>>6==mcupos+r>>6); + mcupos+=r+1; + x=huffcode&(1<>s-1)) x-=(1<=1; --i) cbuf[cpos++]=i<<4|s; + cbuf[cpos++]=s<<4|huffcode<<2>>s&3|12; + ssum+=s; + } + } + else { // DC: rs = 0S, s<12 + jassert(rs<12); + ++mcupos; + x=huffcode&(1<>rs-1)) x-=(1<=0 && mcupos>>6<10); + const int comp=color[mcupos>>6]; + jassert(comp>=0 && comp<4); + dc=pred[comp]+=x; + jassert((cpos&63)==0); + cbuf[cpos++]=dc+1023>>3; + ssum4=ssum3; + ssum3=ssum2; + ssum2=ssum1; + ssum1=ssum; + ssum=rs; + } + jassert(mcupos>=0 && mcupos<=mcusize); + if (mcupos>=mcusize) { + mcupos=0; + if (++column==width) column=0, ++row; + } + huffcode=huffsize=huffbits=0, rs=-1; + } + } + } + } + + // Estimate next bit probability + if (!jpeg || !data) return next_jpeg; + + // Context model + const int N=19; // size of t, number of contexts + static BH<9> t(MEM); // context hash -> bit history + // As a cache optimization, the context does not include the last 1-2 + // bits of huffcode if the length (huffbits) is not a multiple of 3. + // The 7 mapped values are for context+{"", 0, 00, 01, 1, 10, 11}. 
+ static Array cxt(N); // context hashes + static Array cp(N); // context pointers + static StateMap sm[N]; + static Mixer m1(32, 800, 4); + static APM a1(1024), a2(0x10000); + const static U8 zzu[64]={ // zigzag coef -> u,v + 0,1,0,0,1,2,3,2,1,0,0,1,2,3,4,5,4,3,2,1,0,0,1,2,3,4,5,6,7,6,5,4, + 3,2,1,0,1,2,3,4,5,6,7,7,6,5,4,3,2,3,4,5,6,7,7,6,5,4,5,6,7,7,6,7}; + const static U8 zzv[64]={ + 0,0,1,2,1,0,0,1,2,3,4,3,2,1,0,0,1,2,3,4,5,6,5,4,3,2,1,0,0,1,2,3, + 4,5,6,7,7,6,5,4,3,2,1,2,3,4,5,6,7,7,6,5,4,3,4,5,6,7,7,6,5,6,7,7}; + + + // Update model + if (cp[N-1]) { + for (int i=0; i>6]; + const int coef=(mcupos&63)|comp<<6; + const int hc=huffcode|1<2 || huffbits==0) hbcount=0; + jassert(coef>=0 && coef<256); + const int zu=zzu[mcupos&63], zv=zzv[mcupos&63]; + if (hbcount==0) { + const int mpos=mcupos>>4|!(mcupos&-64)<<7; + int n=0; + cxt[0]=hash(++n, hc, mcupos>>2, min(3, mcupos&63)); + cxt[1]=hash(++n, hc, mpos>>4, cbuf[cpos-mcusize]); + cxt[2]=hash(++n, hc, mpos>>4, cbuf[cpos-width*mcusize]); + cxt[3]=hash(++n, hc, ilog(ssum3), coef); + cxt[4]=hash(++n, hc, coef, column>>3); + cxt[5]=hash(++n, hc, coef, column>>1); + cxt[6]=hash(++n, hc, rs1, mpos); + cxt[7]=hash(++n, hc, rs1, rs2); + cxt[8]=hash(++n, hc, rs1, rs2, rs3); + cxt[9]=hash(++n, hc, ssum>>4, mcupos); + cxt[10]=hash(++n, hc, mpos, cbuf[cpos-1]); + cxt[11]=hash(++n, hc, dc); + cxt[12]=hash(++n, hc, rs1, coef); + cxt[13]=hash(++n, hc, rs1, rs2, coef); + cxt[14]=hash(++n, hc, mcupos>>3, ssum3>>3); + cxt[15]=hash(++n, hc, huff1); + cxt[16]=hash(++n, hc, coef, huff1); + cxt[17]=hash(++n, hc, zu, comp); + cxt[18]=hash(++n, hc, zv, comp); + } + + // Predict next bit + m1.add(128); + assert(hbcount<=2); + for (int i=0; i4))); + } + cm.mix(m); +} + +//////////////////////////// indirectModel ///////////////////// + +// The context is a byte string history that occurs within a +// 1 or 2 byte context. 
+ +void indirectModel(Mixer& m) { + static ContextMap cm(MEM, 6); + static U32 t1[256]; + static U16 t2[0x10000]; + + if (!bpos) { + U32 d=c4&0xffff, c=d&255; + U32& r1=t1[d>>8]; + r1=r1<<8|c; + U16& r2=t2[c4>>8&0xffff]; + r2=r2<<8|c; + U32 t=c|t1[c]<<8; + cm.set(t&0xffff); + cm.set(t&0xffffff); + cm.set(t); + cm.set(t&0xff00); + t=d|t2[d]<<16; + cm.set(t&0xffffff); + cm.set(t); + + } + cm.mix(m); +} + +//////////////////////////// dmcModel ////////////////////////// + +// Model using DMC. The bitwise context is represented by a state graph, +// initilaized to a bytewise order 1 model as in +// http://plg.uwaterloo.ca/~ftp/dmc/dmc.c but with the following difference: +// - It uses integer arithmetic. +// - The threshold for cloning a state increases as memory is used up. +// - Each state maintains both a 0,1 count and a bit history (as in a +// context model). The 0,1 count is best for stationary data, and the +// bit history for nonstationary data. The bit history is mapped to +// a probability adaptively using a StateMap. The two computed probabilities +// are combined. +// - When memory is used up the state graph is reinitialized to a bytewise +// order 1 context as in the original DMC. However, the bit histories +// are not cleared. 
+ +struct DMCNode { // 12 bytes + unsigned int nx[2]; // next pointers + U8 state; // bit history + unsigned int c0:12, c1:12; // counts * 256 +}; + +void dmcModel(Mixer& m) { + static int top=0, curr=0; // allocated, current node + static Array t(MEM*2); // state graph + static StateMap sm; + static int threshold=256; + + // clone next state + if (top>0 && top=threshold*2 && nn-n>=threshold*3) { + int r=n*4096/nn; + assert(r>=0 && r<=4096); + t[next].c0 -= t[top].c0 = t[next].c0*r>>12; + t[next].c1 -= t[top].c1 = t[next].c1*r>>12; + t[top].nx[0]=t[next].nx[0]; + t[top].nx[1]=t[next].nx[1]; + t[top].state=t[next].state; + t[curr].nx[y]=top; + ++top; + if (top==MEM*2) threshold=512; + if (top==MEM*3) threshold=768; + } + } + + // Initialize to a bytewise order 1 model at startup or when flushing memory + if (top==t.size() && bpos==1) top=0; + if (top==0) { + assert(t.size()>=65536); + for (int i=0; i<256; ++i) { + for (int j=0; j<256; ++j) { + if (i<127) { + t[j*256+i].nx[0]=j*256+i*2+1; + t[j*256+i].nx[1]=j*256+i*2+2; + } + else { + t[j*256+i].nx[0]=(i-127)*256; + t[j*256+i].nx[1]=(i+1)*256; + } + t[j*256+i].c0=128; + t[j*256+i].c1=128; + } + } + top=65536; + curr=0; + threshold=256; + } + + // update count, state + if (y) { + if (t[curr].c1<3800) t[curr].c1+=256; + } + else if (t[curr].c0<3800) t[curr].c0+=256; + t[curr].state=nex(t[curr].state, y); + curr=t[curr].nx[y]; + + // predict + const int pr1=sm.p(t[curr].state); + const int n1=t[curr].c1; + const int n0=t[curr].c0; + const int pr2=(n1+5)*4096/(n0+n1+10); + m.add(stretch(pr1)); + m.add(stretch(pr2)); +} + +//////////////////////////// contextModel ////////////////////// + +typedef enum {DEFAULT, JPEG, EXE, TEXT} Filetype; + +// This combines all the context models with a Mixer. 
+ +int contextModel2() { + static ContextMap cm(MEM*32, 9); + static RunContextMap rcm7(MEM), rcm9(MEM), rcm10(MEM); + static Mixer m(800, 3088, 7, 128); + static U32 cxt[16]; // order 0-11 contexts + static Filetype filetype=DEFAULT; + static int size=0; // bytes remaining in block +// static const char* typenames[4]={"", "jpeg ", "exe ", "text "}; + + // Parse filetype and size + if (bpos==0) { + --size; + if (size==-1) filetype=(Filetype)buf(1); + if (size==-5) { + size=buf(4)<<24|buf(3)<<16|buf(2)<<8|buf(1); +// if (filetype<=3) printf("(%s%d)", typenames[filetype], size); + if (filetype==EXE) size+=8; + } + } + + m.update(); + m.add(256); + + // Test for special file types + int isjpeg=jpegModel(m); // 1 if JPEG is detected, else 0 + int ismatch=ilog(matchModel(m)); // Length of longest matching context + int isbmp=bmpModel(m); // Image width (bytes) if BMP or TIFF detected, or 0 + + if (isjpeg) { + m.set(1, 8); + m.set(c0, 256); + m.set(buf(1), 256); + return m.p(); + } + else if (isbmp>0) { + static int col=0; + if (++col>=24) col=0; + m.set(2, 8); + m.set(col, 24); + m.set(buf(isbmp)+buf(3)>>4, 32); + m.set(c0, 256); + return m.p(); + } + + + // Normal model + if (bpos==0) { + for (int i=15; i>0; --i) // update order 0-11 context hashes + cxt[i]=cxt[i-1]*257+(c4&255)+1; + for (int i=0; i<7; ++i) + cm.set(cxt[i]); + rcm7.set(cxt[7]); + cm.set(cxt[8]); + rcm9.set(cxt[10]); + rcm10.set(cxt[12]); + cm.set(cxt[14]); + } + int order=cm.mix(m); + + rcm7.mix(m); + rcm9.mix(m); + rcm10.mix(m); + + if (level>=4) { + sparseModel(m,ismatch,order); + distanceModel(m); + picModel(m); + recordModel(m); + wordModel(m); + indirectModel(m); + dmcModel(m); + if (filetype==EXE) exeModel(m); + } + + + + order = order-2; + if(order<0) order=0; + + U32 c1=buf(1), c2=buf(2), c3=buf(3), c; + + m.set(c1+8, 264); + m.set(c0, 256); + m.set(order+8*(c4>>5&7)+64*(c1==c2)+128*(filetype==EXE), 256); + m.set(c2, 256); + m.set(c3, 256); + m.set(ismatch, 256); + + if(bpos) + { + 
c=c0<<(8-bpos); if(bpos==1)c+=c3/2; + c=(min(bpos,5))*256+c1/32+8*(c2/32)+(c&192); + } + else c=c3/128+(c4>>31)*2+4*(c2/64)+(c1&240); + m.set(c, 1536); + int pr=m.p(); + return pr; +} + + +//////////////////////////// Predictor ///////////////////////// + +// A Predictor estimates the probability that the next bit of +// uncompressed data is 1. Methods: +// p() returns P(1) as a 12 bit number (0-4095). +// update(y) trains the predictor with the actual bit (0 or 1). + +class Predictor { + int pr; // next prediction +public: + Predictor(); + int p() const {assert(pr>=0 && pr<4096); return pr;} + void update(); +}; + +Predictor::Predictor(): pr(2048) {} + +void Predictor::update() { + static APM a(256), a1(0x10000), a2(0x10000), a3(0x10000), + a4(0x10000), a5(0x10000), a6(0x10000); + + // Update global context: pos, bpos, c0, c4, buf + c0+=c0+y; + if (c0>=256) { + buf[pos++]=c0; + c4=(c4<<8)+c0-256; + c0=1; + } + bpos=(bpos+1)&7; + + // Filter the context model with APMs + int pr0=contextModel2(); + + pr=a.p(pr0, c0); + + int pr1=a1.p(pr0, c0+256*buf(1)); + int pr2=a2.p(pr0, c0^hash(buf(1), buf(2))&0xffff); + int pr3=a3.p(pr0, c0^hash(buf(1), buf(2), buf(3))&0xffff); + pr0=pr0+pr1+pr2+pr3+2>>2; + + pr1=a4.p(pr, c0+256*buf(1)); + pr2=a5.p(pr, c0^hash(buf(1), buf(2))&0xffff); + pr3=a6.p(pr, c0^hash(buf(1), buf(2), buf(3))&0xffff); + pr=pr+pr1+pr2+pr3+2>>2; + + pr=pr+pr0+1>>1; +} + +//////////////////////////// Encoder //////////////////////////// + +// An Encoder does arithmetic encoding. Methods: +// Encoder(COMPRESS, f) creates encoder for compression to archive f, which +// must be open past any header for writing in binary mode. +// Encoder(DECOMPRESS, f) creates encoder for decompression from archive f, +// which must be open past any header for reading in binary mode. +// code(i) in COMPRESS mode compresses bit i (0 or 1) to file f. +// code() in DECOMPRESS mode returns the next decompressed bit from file f. 
+// Global y is set to the last bit coded or decoded by code(). +// compress(c) in COMPRESS mode compresses one byte. +// decompress() in DECOMPRESS mode decompresses and returns one byte. +// flush() should be called exactly once after compression is done and +// before closing f. It does nothing in DECOMPRESS mode. +// size() returns current length of archive +// setFile(f) sets alternate source to FILE* f for decompress() in COMPRESS +// mode (for testing transforms). +// If level (global) is 0, then data is stored without arithmetic coding. + +typedef enum {COMPRESS, DECOMPRESS} Mode; +class Encoder { +private: + Predictor predictor; + const Mode mode; // Compress or decompress? + FILE* archive; // Compressed data file + U32 x1, x2; // Range, initially [0, 1), scaled by 2^32 + U32 x; // Decompress mode: last 4 input bytes of archive + FILE *alt; // decompress() source in COMPRESS mode + + // Compress bit y or return decompressed bit + int code(int i=0) { + int p=predictor.p(); + assert(p>=0 && p<4096); + p+=p<2048; + U32 xmid=x1 + (x2-x1>>12)*p + ((x2-x1&0xfff)*p>>12); + assert(xmid>=x1 && xmid>24, archive); + x1<<=8; + x2=(x2<<8)+255; + if (mode==DECOMPRESS) x=(x<<8)+(getc(archive)&255); // EOF is OK + } + return y; + } + +public: + Encoder(Mode m, FILE* f); + Mode getMode() const {return mode;} + long size() const {return ftell(archive);} // length of archive so far + void flush(); // call this when compression is finished + void setFile(FILE* f) {alt=f;} + + // Compress one byte + void compress(int c) { + assert(mode==COMPRESS); + if (level==0) + putc(c, archive); + else + for (int i=7; i>=0; --i) + code((c>>i)&1); + } + + // Decompress and return one byte + int decompress() { + if (mode==COMPRESS) { + assert(alt); + return getc(alt); + } + else if (level==0) + return getc(archive); + else { + int c=0; + for (int i=0; i<8; ++i) + c+=c+code(); + return c; + } + } +}; + +Encoder::Encoder(Mode m, FILE* f): + mode(m), archive(f), x1(0), x2(0xffffffff), x(0), 
alt(0) { + if (level>0 && mode==DECOMPRESS) { // x = first 4 bytes of archive + for (int i=0; i<4; ++i) + x=(x<<8)+(getc(archive)&255); + } +} + +void Encoder::flush() { + if (mode==COMPRESS && level>0) + putc(x1>>24, archive); // Flush first unequal byte of range +} + +/////////////////////////// Filters ///////////////////////////////// +// +// Before compression, data is encoded in blocks with the following format: +// +// +// +// Type is 1 byte (type Filetype): DEFAULT=0, JPEG, EXE +// Size is 4 bytes in big-endian format. +// Encoded-data decodes to bytes. The encoded size might be +// different. Encoded data is designed to be more compressible. +// +// void encode(FILE* in, FILE* out, int n); +// +// Reads n bytes of in (open in "rb" mode) and encodes one or +// more blocks to temporary file out (open in "wb+" mode). +// The file pointer of in is advanced n bytes. The file pointer of +// out is positioned after the last byte written. +// +// en.setFile(FILE* out); +// int decode(Encoder& en); +// +// Decodes and returns one byte. Input is from en.decompress(), which +// reads from out if in COMPRESS mode. During compression, n calls +// to decode() must exactly match n bytes of in, or else it is compressed +// as type 0 without encoding. +// +// Filetype detect(FILE* in, int n, Filetype type); +// +// Reads n bytes of in, and detects when the type changes to +// something else. If it does, then the file pointer is repositioned +// to the start of the change and the new type is returned. If the type +// does not change, then it repositions the file pointer n bytes ahead +// and returns the old type. +// +// For each type X there are the following 2 functions: +// +// void encode_X(FILE* in, FILE* out, int n, ...); +// +// encodes n bytes from in to out. +// +// int decode_X(Encoder& en); +// +// decodes one byte from en and returns it. decode() and decode_X() +// maintain state information using static variables. 
+ +// Detect EXE or JPEG data +Filetype detect(FILE* in, int n, Filetype type) { + U32 buf1=0, buf0=0; // last 8 bytes + long start=ftell(in); + + // For EXE detection + Array abspos(256), // CALL/JMP abs. addr. low byte -> last offset + relpos(256); // CALL/JMP relative addr. low byte -> last offset + int e8e9count=0; // number of consecutive CALL/JMPs + int e8e9pos=0; // offset of first CALL or JMP instruction + int e8e9last=0; // offset of most recent CALL or JMP + + // For JPEG detection + int soi=0, sof=0, sos=0; // position where found + + for (int i=0; i>24; + buf0=buf0<<8|c; + + // Detect JPEG by code SOI APPx (FF D8 FF Ex) followed by + // SOF0 (FF C0 xx xx 08) and SOS (FF DA) within a reasonable distance. + // Detect end by any code other than RST0-RST7 (FF D9-D7) or + // a byte stuff (FF 00). + + if (i>=3 && (buf0&0xfffffff0)==0xffd8ffe0) soi=i; + if (soi && i-soi<0x10000 && (buf1&0xff)==0xff + && (buf0&0xff0000ff)==0xc0000008) + sof=i; + if (soi && sof && sof>soi && i-soi<0x10000 && i-sof<0x1000 + && (buf0&0xffff)==0xffda) { + sos=i; + if (type!=JPEG) return fseek(in, start+soi-3, SEEK_SET), JPEG; + } + if (type==JPEG && sos && i>sos && (buf0&0xff00)==0xff00 + && (buf0&0xff)!=0 && (buf0&0xf8)!=0xd0) + return DEFAULT; + + // Detect EXE if the low order byte (little-endian) XX is more + // recently seen (and within 4K) if a relative to absolute address + // conversion is done in the context CALL/JMP (E8/E9) XX xx xx 00/FF + // 4 times in a row. Detect end of EXE at the last + // place this happens when it does not happen for 64KB. 
+ + if ((buf1&0xfe)==0xe8 && (buf0+1&0xfe)==0) { + int r=buf0>>24; // relative address low 8 bits + int a=(buf0>>24)+i&0xff; // absolute address low 8 bits + int rdist=i-relpos[r]; + int adist=i-abspos[a]; + if (adist5) { + e8e9last=i; + ++e8e9count; + if (e8e9pos==0 || e8e9pos>abspos[a]) e8e9pos=abspos[a]; + } + else e8e9count=0; + if (type!=EXE && e8e9count>=4 && e8e9pos>5) + return fseek(in, start+e8e9pos-5, SEEK_SET), EXE; + abspos[a]=i; + relpos[r]=i; + } + if (type==EXE && i-e8e9last>0x1000) + return fseek(in, start+e8e9last, SEEK_SET), DEFAULT; + } + return type; +} + +// Default encoding as self +void encode_default(FILE* in, FILE* out, int len) { + while (len--) putc(getc(in), out); +} + +int decode_default(Encoder& en) { + return en.decompress(); +} + +// JPEG encode as self. The purpose is to shield jpegs from exe transform. +void encode_jpeg(FILE* in, FILE* out, int len) { + while (len--) putc(getc(in), out); +} + +int decode_jpeg(Encoder& en) { + return en.decompress(); +} + +// EXE transform: ... +// Encoded-size is 4 bytes, MSB first. +// begin is the offset of the start of the input file, 4 bytes, MSB first. +// Each block applies the e8e9 transform to strings falling entirely +// within the block starting from the end and working backwards. +// The 5 byte pattern is E8/E9 xx xx xx 00/FF (x86 CALL/JMP xxxxxxxx) +// where xxxxxxxx is a relative address LSB first. The address is +// converted to an absolute address by adding the offset mod 2^25 +// (in range +-2^24). 
+ +void encode_exe(FILE* in, FILE* out, int len, int begin) { + const int BLOCK=0x10000; + Array blk(BLOCK); + fprintf(out, "%c%c%c%c", len>>24, len>>16, len>>8, len); // size, MSB first + fprintf(out, "%c%c%c%c", begin>>24, begin>>16, begin>>8, begin); + + // Transform + for (int offset=0; offset=4; --i) { + if ((blk[i-4]==0xe8||blk[i-4]==0xe9) && (blk[i]==0||blk[i]==0xff)) { + int a=(blk[i-3]|blk[i-2]<<8|blk[i-1]<<16|blk[i]<<24)+offset+begin+i+1; + a<<=7; + a>>=7; + blk[i]=a>>24; + blk[i-1]=a>>16; + blk[i-2]=a>>8; + blk[i-3]=a; + } + } + fwrite(&blk[0], 1, bytesRead, out); + } +} + +int decode_exe(Encoder& en) { + const int BLOCK=0x10000; // block size + static int offset=0, q=0; // decode state: file offset, queue size + static int size=0; // where to stop coding + static int begin=0; // offset in file + static U8 c[5]; // queue of last 5 bytes, c[0] at front + + // Read size from first 4 bytes, MSB first + while (offset==size && q==0) { + offset=0; + size=en.decompress()<<24; + size|=en.decompress()<<16; + size|=en.decompress()<<8; + size|=en.decompress(); + begin=en.decompress()<<24; + begin|=en.decompress()<<16; + begin|=en.decompress()<<8; + begin|=en.decompress(); + } + + // Fill queue + while (offset subtract location from x + if (q==5 && (c[4]==0xe8||c[4]==0xe9) && (c[0]==0||c[0]==0xff) + && ((offset-1^offset-5)&-BLOCK)==0) { // not crossing block boundary + int a=(c[3]|c[2]<<8|c[1]<<16|c[0]<<24)-offset-begin; + a<<=7; + a>>=7; + c[3]=a; + c[2]=a>>8; + c[1]=a>>16; + c[0]=a>>24; + } + + // return oldest byte in queue + assert(q>0 && q<=5); + return c[--q]; +} + + + +// Split n bytes into blocks by type. For each block, output +// and call encode_X to convert to type X. 
+void encode(FILE* in, FILE* out, int n) { + Filetype type=DEFAULT; + long begin=ftell(in); + while (n>0) { + Filetype nextType=detect(in, n, type); + long end=ftell(in); + fseek(in, begin, SEEK_SET); + int len=int(end-begin); + if (len>0) { + fprintf(out, "%c%c%c%c%c", type, len>>24, len>>16, len>>8, len); + switch(type) { + case JPEG: encode_jpeg(in, out, len); break; + case EXE: encode_exe(in, out, len, begin); break; + default: encode_default(in, out, len); break; + } + } + n-=len; + type=nextType; + begin=end; + } +} + +// Decode ... +int decode(Encoder& en) { + static Filetype type=DEFAULT; + static int len=0; + while (len==0) { + type=(Filetype)en.decompress(); + len=en.decompress()<<24; + len|=en.decompress()<<16; + len|=en.decompress()<<8; + len|=en.decompress(); + if (len<0) len=1; + } + --len; + switch (type) { + case JPEG: return decode_jpeg(en); + case EXE: return decode_exe(en); + default: return decode_default(en); + } +} + +//////////////////// Compress, Decompress //////////////////////////// + +// Print progress: n is the number of bytes compressed or decompressed +void printStatus(int n) { + if (n>0 && !(n&0x0fff)) + printf("%12d\b\b\b\b\b\b\b\b\b\b\b\b", n), fflush(stdout); +} + +// Compress a file +void compress(const char* filename, long filesize, Encoder& en) { + assert(en.getMode()==COMPRESS); + assert(filename && filename[0]); + FILE *f=fopen(filename, "rb"); + if (!f) perror(filename), quit(); + long start=en.size(); + printf("%s %ld -> ", filename, filesize); + + // Transform and test in blocks + const int BLOCK=MEM*64; + for (int i=0; filesize>0; i+=BLOCK) { + int size=BLOCK; + if (size>filesize) size=filesize; + FILE* tmp=tmpfile(); + if (!tmp) perror("tmpfile"), quit(); + long savepos=ftell(f); + encode(f, tmp, size); + + // Test transform + rewind(tmp); + en.setFile(tmp); + fseek(f, savepos, SEEK_SET); + long j; + int c1=0, c2=0; + for (j=0; j>24); + en.compress(size>>16); + en.compress(size>>8); + en.compress(size); + fseek(f, 
savepos, SEEK_SET); + for (int j=0; j ", filename, filesize); + bool found=false; // mismatch? + for (int i=0; i ", filename, filesize); + for (int i=0; i ", filename, filesize); + for (int i=0; i=s.size()) s.resize(len*2+1); + if (c!='\r') s[len++]=c; + } + if (len>=s.size()) s.resize(len+1); + s[len]=0; + if (c==EOF || c==26) + return 0; + else + return s.c_str(); +} + +// int expand(String& archive, String& s, const char* fname, int base) { +// Given file name fname, print its length and base name (beginning +// at fname+base) to archive in format "%ld\t%s\r\n" and append the +// full name (including path) to String s in format "%s\n". If fname +// is a directory then substitute all of its regular files and recursively +// expand any subdirectories. Base initially points to the first +// character after the last / in fname, but in subdirectories includes +// the path from the topmost directory. Return the number of files +// whose names are appended to s and archive. + +// Same as expand() except fname is an ordinary file +int putsize(String& archive, String& s, const char* fname, int base) { + int result=0; + FILE *f=fopen(fname, "rb"); + if (f) { + fseek(f, 0, SEEK_END); + long len=ftell(f); + if (len>=0) { + static char blk[24]; + sprintf(blk, "%ld\t", len); + archive+=blk; + archive+=(fname+base); + archive+="\r\n"; + s+=fname; + s+="\n"; + ++result; + } + fclose(f); + } + return result; +} + +#ifdef WINDOWS + +int expand(String& archive, String& s, const char* fname, int base) { + int result=0; + DWORD attr=GetFileAttributes(fname); + if ((attr != 0xFFFFFFFF) && (attr & FILE_ATTRIBUTE_DIRECTORY)) { + WIN32_FIND_DATA ffd; + String fdir(fname); + fdir+="/*"; + HANDLE h=FindFirstFile(fdir.c_str(), &ffd); + while (h!=INVALID_HANDLE_VALUE) { + if (!equals(ffd.cFileName, ".") && !equals(ffd.cFileName, "..")) { + String d(fname); + d+="/"; + d+=ffd.cFileName; + result+=expand(archive, s, d.c_str(), base); + } + if (FindNextFile(h, &ffd)!=TRUE) break; + } + 
FindClose(h); + } + else // ordinary file + result=putsize(archive, s, fname, base); + return result; +} + +#else +#ifdef UNIX + +int expand(String& archive, String& s, const char* fname, int base) { + int result=0; + struct stat sb; + if (stat(fname, &sb)<0) return 0; + + // If a regular file and readable, get file size + if (sb.st_mode & S_IFREG && sb.st_mode & 0400) + result+=putsize(archive, s, fname, base); + + // If a directory with read and execute permission, traverse it + else if (sb.st_mode & S_IFDIR && sb.st_mode & 0400 && sb.st_mode & 0100) { + DIR *dirp=opendir(fname); + if (!dirp) { + perror("opendir"); + return result; + } + dirent *dp; + while(errno=0, (dp=readdir(dirp))!=0) { + if (!equals(dp->d_name, ".") && !equals(dp->d_name, "..")) { + String d(fname); + d+="/"; + d+=dp->d_name; + result+=expand(archive, s, d.c_str(), base); + } + } + if (errno) perror("readdir"); + closedir(dirp); + } + else printf("%s is not a readable file or directory\n", fname); + return result; +} + +#else // Not WINDOWS or UNIX, ignore directories + +int expand(String& archive, String& s, const char* fname, int base) { + return putsize(archive, s, fname, base); +} + +#endif +#endif + + +// To compress to file1.paq8l: paq8l [-n] file1 [file2...] +// To decompress: paq8l file1.paq8l [output_dir] +int main(int argc, char** argv) { + bool pause=argc<=2; // Pause when done? 
+ try { + + // Get option + bool doExtract=false; // -d option + if (argc>1 && argv[1][0]=='-' && argv[1][1] && !argv[1][2]) { + if (argv[1][1]>='0' && argv[1][1]<='9') + level=argv[1][1]-'0'; + else if (argv[1][1]=='d') + doExtract=true; + else + quit("Valid options are -0 through -9 or -d\n"); + --argc; + ++argv; + pause=false; + } + + // Print help message + if (argc<2) { + printf(PROGNAME " archiver (C) 2006, Matt Mahoney et al.\n" + "Free under GPL, http://www.gnu.org/licenses/gpl.txt\n\n" +#ifdef WINDOWS + "To compress or extract, drop a file or folder on the " + PROGNAME " icon.\n" + "The output will be put in the same folder as the input.\n" + "\n" + "Or from a command window: " +#endif + "To compress:\n" + " " PROGNAME " -level file (compresses to file." PROGNAME ")\n" + " " PROGNAME " -level archive files... (creates archive." PROGNAME ")\n" + " " PROGNAME " file (level -%d, pause when done)\n" + "level: -0 = store, -1 -2 -3 = faster (uses 35, 48, 59 MB)\n" + "-4 -5 -6 -7 -8 = smaller (uses 133, 233, 435, 837, 1643 MB)\n" +#if defined(WINDOWS) || defined (UNIX) + "You may also compress directories.\n" +#endif + "\n" + "To extract or compare:\n" + " " PROGNAME " -d dir1/archive." PROGNAME " (extract to dir1)\n" + " " PROGNAME " -d dir1/archive." PROGNAME " dir2 (extract to dir2)\n" + " " PROGNAME " archive." PROGNAME " (extract, pause when done)\n" + "\n" + "To view contents: more < archive." PROGNAME "\n" + "\n", + DEFAULT_OPTION); + quit(); + } + + FILE* archive=0; // compressed file + int files=0; // number of files to compress/decompress + Array fname(1); // file names (resized to files) + Array fsize(1); // file lengths (resized to files) + + // Compress or decompress? Get archive name + Mode mode=COMPRESS; + String archiveName(argv[1]); + { + const int prognamesize=strlen(PROGNAME); + const int arg1size=strlen(argv[1]); + if (arg1size>prognamesize+1 && argv[1][arg1size-prognamesize-1]=='.' 
+ && equals(PROGNAME, argv[1]+arg1size-prognamesize)) { + mode=DECOMPRESS; + } + else if (doExtract) + mode=DECOMPRESS; + else { + archiveName+="."; + archiveName+=PROGNAME; + } + } + + // Compress: write archive header, get file names and sizes + String filenames; + if (mode==COMPRESS) { + + // Expand filenames to read later. Write their base names and sizes + // to archive. + String header_string; + for (int i=1; i0 && name[len-1]=='/') // remove trailing / + name[--len]=0; + int base=len-1; + while (base>=0 && name[base]!='/') --base; // find last / + ++base; + if (base==0 && len>=2 && name[1]==':') base=2; // chop "C:" + int expanded=expand(header_string, filenames, name.c_str(), base); + if (!expanded && (i>1||argc==2)) + printf("%s: not found, skipping...\n", name.c_str()); + files+=expanded; + } + + // If archive doesn't exist and there is at least one file to compress + // then create the archive header. + if (files<1) quit("Nothing to compress\n"); +// archive=fopen(archiveName.c_str(), "rb"); +// if (archive) +// printf("%s already exists\n", archiveName.c_str()), quit(); + archive=fopen(archiveName.c_str(), "wb+"); + if (!archive) perror(archiveName.c_str()), quit(); + fprintf(archive, PROGNAME " -%d\r\n%s\x1A", + level, header_string.c_str()); + printf("Creating archive %s with %d file(s)...\n", + archiveName.c_str(), files); + + // Fill fname[files], fsize[files] with input filenames and sizes + fname.resize(files); + fsize.resize(files); + char *p=&filenames[0]; + rewind(archive); + getline(archive); + for (int i=0; i=0); + fname[i]=p; + while (*p!='\n') ++p; + assert(p-filenames.c_str()9) level=DEFAULT_OPTION; + + // Fill fname[files], fsize[files] with output file names and sizes + while (getline(archive)) ++files; // count files + printf("Extracting %d file(s) from %s -%d\n", files, + archiveName.c_str(), level); + long header_size=ftell(archive); + filenames.resize(header_size+4); // copy of header + rewind(archive); + fread(&filenames[0], 1, 
header_size, archive); + fname.resize(files); + fsize.resize(files); + char* p=&filenames[0]; + while (*p && *p!='\r') ++p; // skip first line + ++p; + for (int i=0; i=0 && level<=9); + buf.setsize(MEM*8); + + // Compress or decompress files + assert(fname.size()==files); + assert(fsize.size()==files); + long total_size=0; // sum of file sizes + for (int i=0; i %ld\n", total_size, en.size()); + } + + // Decompress files to dir2: paq8l -d dir1/archive.paq8l dir2 + // If there is no dir2, then extract to dir1 + // If there is no dir1, then extract to . + else { + assert(argc>=2); + String dir(argc>2?argv[2]:argv[1]); + if (argc==2) { // chop "/archive.paq8l" + int i; + for (i=dir.size()-2; i>=0; --i) { + if (dir[i]=='/' || dir[i]=='\\') { + dir[i]=0; + break; + } + if (i==1 && dir[i]==':') { // leave "C:" + dir[i+1]=0; + break; + } + } + if (i==-1) dir="."; // "/" not found + } + dir=dir.c_str(); + if (dir[0] && (dir.size()!=3 || dir[1]!=':')) dir+="/"; + for (int i=0; i. + +paq9a is an experimental file compressor and archiver. Usage: + + paq9a {a|x|l} archive [[-opt] files...]... + +Commands: + + a = create archive and compress named files. + x = extract from archive. + l = list contents. + +Archives are "solid". You can only create new archives. You cannot +modify existing archives. File names are stored and extracted exactly as +named when the archive is created, but you have the option to rename them +during extraction. Files are never clobbered. + +The "a" command creates a new archive and adds the named files. +Wildcards are permitted if compiled with g++. Options +and filenames may be in any order. Options apply only to filenames +after the option, and override previous options. Options are: + + -s = store without compression. + -c = compress (default). + -1 through -9 selects memory level from 18 MB to 1.5 GB Default is -7 + using 405 MB. The memory option must be set before the first file. + Decompression requires the same amount of memory. 
+ +For example: + + paq9a a foo.paq9a a.txt -3 -s b.txt -c c.txt tmp/d.txt /tmp/e.txt + +creates the archive foo.paq9a with 5 files. The file b.txt is +stored without compression. The other 4 files are compressed +at memory level 3. Extraction requires the same memory as compression. + +If any named file does not exist, then it is omitted from the archive +with a warning and the remaining files are added. An existing +archive cannot be overwritten. There must be at least one filename on +the command line. + +The "x" command extracts the archive contents, creating files exactly +as named when the archive was created. Files cannot be overwritten. +If a file already exists or cannot be created, then it is skipped. +For example, "tmp/d.txt" would be skipped if either the current +directory does not have a subdirectory tmp, or tmp is write +protected, or tmp/d.txt already exists. + +If "x" is followed by one or more file names, then the output files +are renamed in the order they were added to the archive and any remaining +contents are extracted without renaming. For example: + + paq9a x foo.paq9a x.txt y.txt + +would extract a.txt to x.txt and b.txt to y.txt, then extract c.txt, +tmp/d.txt and /tmp/e.txt. If the command line has more filenames than +the archive then the extra arguments are ignored. Options are not +allowed. + +The "l" (letter l) command lists the contents. Any extra arguments +are ignored. + +Any other command, or no command, displays a help message. + + +ARCHIVE FORMAT + + "lPq" 1 mem [filename {'\0' mode usize csize contents}...]... + +The first 4 bytes are "lPq\x01" (1 is the version number). + +mem is a digit '1' through '9', where '9' uses the most memory (1.5 GB). + +A file is stored as one or more blocks. The filename is stored +only in the first block as a NUL terminated string. Subsequent +blocks start with a 0. + +The mode is 's' if the block is stored and 'c' if compressed. 
+ +usize = uncompressed size as a 4 byte big-endian number (MSB first). + +csize = compressed size as a 4 byte big-endian number. + +The contents is copied from the file itself if mode is 's' or the +compressed contents otherwise. Its length is exactly csize bytes. + + +COMPRESSED FORMAT + +Files are preprocessed with LZP and then compressed with a context +mixing compressor and arithmetic coded one bit at a time. Model +contents are maintained across files. + +The LZP stage predicts the next byte by matching the current context +(order 12 or higher) to a rotating buffer. If a match is found +then the next byte after the match is predicted. If the next byte +matches the prediction, then a 1 bit is coded and the context is extended. +Otherwise a 0 is coded followed by 8 bits of the actual byte in MSB to +LSB order. + +A 1 bit is modeled using the match length as context, then refined +in 3 stages using successively longer contexts. The predictions are +adjusted by 2 input neurons selected by a context hash with the second +input fixed. + +If the LZP prediction is missed, then the literal is coded using a chain +of predictions which are mixed using neurons, where one input is the +previous prediction and the second input is the prediction given the +current context. The current context is mapped to an 8 bit state +representing the bit history, the sequence of bits previously observed +in that context. The bit history is used both to select the neuron +and is mapped to a prediction that provides the second input. In addition, +if the known bits of the current byte match the LZP incorrectly predicted +byte, then this fact is used to select one of 2 sets of neurons (512 total). + +The contexts, in order, are sparse order-1 with gaps of 3, 2, and 1 +byte, then orders 1 through 6, then word orders 0 and 1, where a word +is a sequence of case insensitive letters (useful for compressing text). +Contexts longer than 1 are hashed.
Order-n contexts consist of a hash +of the last n bytes plus the 0 to 7 known bits of the current byte. +The order 6 context and the word order 0 and 1 contexts also include +the LZP predicted byte. + +All mixing is in the logistic or "stretched" domain: stretch(p) = ln(p/(1-p)), +then "squashed" by the inverse function: squash(p) = 1/(1 + exp(-p)) before +arithmetic coding. A 2 input neuron has 2 weights (w0 and w1) +selected by context. Given inputs x0 and x1 (2 predictions, or one +prediction and a constant), the output prediction is computed: +p = w0*x0 + w1*x1. If the actual bit is y, then the weights are updated +to minimize its coding cost: + + error = y - squash(p) + w0 += x0 * error * L + w1 += x1 * error * L + +where L is the learning rate, normally 1/256, but increased by a factor +of 4 and 2 for the first 2 training cycles (using the 2 low bits +of w0 as a counter). In the implementation, p is represented by a fixed +point number with a 12 bit fractional part in the linear domain (0..4095) +and 8 bits in the logistic domain (-2047..2047 representing -8..8). +Weights are scaled by 24 bits. Both weights are initialized to 1/2 +(expecting 2 probabilities, weighted equally). However, when one input +(x0) is fixed, its weight (w0) is initialized to 0. + +A bit history represents the sequence of 0 and 1 bits observed in a given +context. An 8 bit state represents all possible sequences up to 4 bits +long. Longer sequences are represented by a count of 0 and 1 bits, plus +an indicator of the most recent bit. If counts grow too large, then the +next state represents a pair of smaller counts with about the same ratio. +The state table is the same as used in PAQ8 (all versions) and LPAQ1. + +A state is mapped to a prediction by using a table. A table entry +contains 2 values, p, initialized to 1/2, and n, initialized to 0. +The output prediction is p (in the linear domain, not stretched).
+If the actual bit is y, then the entry is updated: + + error = y - p + p += error/(n + 1.5) + if n < limit then n += 1 + +In practice, p is scaled by 22 bits, and n is 10 bits, packed into +one 32 bit integer. The limit is 255. + +Every 4 bits, contexts are mapped to arrays of 15 states using a +hash table. The first element is the bit history for the current +context ending on a half byte boundary, followed by all possible +contexts formed by appending up to 3 more bits. + +A hash table accepts a 32 bit context, which must be a hash if +longer than 4 bytes. The input is further hashed and divided into +an index (depending on the table size, a power of 2), and an 8 bit +checksum which is stored in the table and used to detect collisions +(not perfectly). A lookup tests 3 adjacent locations within a single +64 byte cache line, and if a matching checksum is not found, then the +entry with the smallest value in the first data element is replaced +(LFU replacement policy). This element represents a bit history +for a context ending on a half byte boundary. The states are ordered +so that larger values represent larger total bit counts, which +estimates the likelihood of future use. The initial state is 0. + +Memory is allocated from MEM = pow(2, opt+22) bytes, where opt is 1 through +9 (user selected). Of this, MEM/2 is for the hash table for storing literal +context states, MEM/8 for the rotating LZP buffer, and MEM/8 for a +hash table of pointers into the buffer, plus 12 MB for miscellaneous data. +Total memory usage is 0.75*MEM + 12 MB. + + +ARITHMETIC CODING + +The arithmetic coder codes a bit with probability p using log2(1/p) bits. +Given input string y, the output is a binary fraction x such that +P(< y) <= x < P(<= y) where P(< y) means the total probability of all inputs +lexicographically less than y and P(<= y) = P(< y) + P(y). Note that one +can always find x with length at most log2(1/P(y)) + 1 bits.
+ +x can be computed efficiently by maintaining a range, low <= x < high +(initially 0..1) and expressing P(y) as a product of predictions: +P(y) = P(y1) P(y2|y1) P(y3|y1y2) P(y4|y1y2y3) ... P(yn|y1y2...yn-1) +where the term P(yi|y1y2...yi-1) means the probability that yi is 1 +given the context y1...yi-1, the previous i-1 bits of y. For each +prediction p, the range is split in proportion to the probabilities +of 0 and 1, then updated by taking the half corresponding to the actual +bit y as the new range, i.e. + + mid = low + (high - low) * p(y = 1) + if y = 0 then (low, high) := (mid, high) + if y = 1 then (low, high) := (low, mid) + +As low and high approach each other, the high order bits of x become +known (because they are the same throughout the range) and can be +output immediately. + +For decoding, the range is split as before and the range is updated +to the half containing x. The corresponding bit y is used to update +the model. Thus, the model has the same knowledge for coding and +decoding.
+ +*/ + +#include +#include +#include +#include +#include +#define NDEBUG // remove for debugging +#include + +int allocated=0; // Total memory allocated by alloc() + +// Create an array p of n elements of type T +template void alloc(T*&p, int n) { + p=(T*)calloc(n, sizeof(T)); + if (!p) printf("Out of memory\n"), exit(1); + allocated+=n*sizeof(T); +} + +// 8, 16, 32 bit unsigned types (adjust as appropriate) +typedef unsigned char U8; +typedef unsigned short U16; +typedef unsigned int U32; + +///////////////////////////// Squash ////////////////////////////// + +// return p = 1/(1 + exp(-d)), d scaled by 8 bits, p scaled by 12 bits +class Squash { + short tab[4096]; +public: + Squash(); + int operator()(int d) { + d+=2048; + if (d<0) return 0; + else if (d>4095) return 4095; + else return tab[d]; + } +} squash; + +Squash::Squash() { + static const int t[33]={ + 1,2,3,6,10,16,27,45,73,120,194,310,488,747,1101, + 1546,2047,2549,2994,3348,3607,3785,3901,3975,4022, + 4050,4068,4079,4085,4089,4092,4093,4094}; + for (int i=-2048; i<2048; ++i) { + int w=i&127; + int d=(i>>7)+16; + tab[i+2048]=(t[d]*(128-w)+t[(d+1)]*w+64) >> 7; + } +} + +//////////////////////////// Stretch /////////////////////////////// + +// Inverse of squash. stretch(d) returns ln(p/(1-p)), d scaled by 8 bits, +// p by 12 bits. d has range -2047 to 2047 representing -8 to 8. +// p has range 0 to 4095 representing 0 to 1. 
+ +class Stretch { + short t[4096]; +public: + Stretch(); + int operator()(int p) const { + assert(p>=0 && p<4096); + return t[p]; + } +} stretch; + +Stretch::Stretch() { + int pi=0; + for (int x=-2047; x<=2047; ++x) { // invert squash() + int i=squash(x); + for (int j=pi; j<=i; ++j) + t[j]=x; + pi=i+1; + } + t[4095]=2047; +} + +///////////////////////////// ilog ////////////////////////////// + +// ilog(x) = round(log2(x) * 16), 0 <= x < 64K +class Ilog { + U8* t; +public: + int operator()(U16 x) const {return t[x];} + Ilog(); +} ilog; + +// Compute lookup table by numerical integration of 1/x +Ilog::Ilog() { + alloc(t, 65536); + U32 x=14155776; + for (int i=2; i<65536; ++i) { + x+=774541002/(i*2-1); // numerator is 2^29/ln 2 + t[i]=x>>24; + } +} + +// llog(x) accepts 32 bits +inline int llog(U32 x) { + if (x>=0x1000000) + return 256+ilog(x>>16); + else if (x>=0x10000) + return 128+ilog(x>>8); + else + return ilog(x); +} + +///////////////////////// state table //////////////////////// + +// State table: +// nex(state, 0) = next state if bit y is 0, 0 <= state < 256 +// nex(state, 1) = next state if bit y is 1 +// +// States represent a bit history within some context. +// State 0 is the starting state (no bits seen). +// States 1-30 represent all possible sequences of 1-4 bits. +// States 31-252 represent a pair of counts, (n0,n1), the number +// of 0 and 1 bits respectively. If n0+n1 < 16 then there are +// two states for each pair, depending on if a 0 or 1 was the last +// bit seen. +// If n0 and n1 are too large, then there is no state to represent this +// pair, so another state with about the same ratio of n0/n1 is substituted. +// Also, when a bit is observed and the count of the opposite bit is large, +// then part of this count is discarded to favor newer data over old. 
+ +static const U8 State_table[256][2]={ +{ 1, 2},{ 3, 5},{ 4, 6},{ 7, 10},{ 8, 12},{ 9, 13},{ 11, 14}, // 0 +{ 15, 19},{ 16, 23},{ 17, 24},{ 18, 25},{ 20, 27},{ 21, 28},{ 22, 29}, // 7 +{ 26, 30},{ 31, 33},{ 32, 35},{ 32, 35},{ 32, 35},{ 32, 35},{ 34, 37}, // 14 +{ 34, 37},{ 34, 37},{ 34, 37},{ 34, 37},{ 34, 37},{ 36, 39},{ 36, 39}, // 21 +{ 36, 39},{ 36, 39},{ 38, 40},{ 41, 43},{ 42, 45},{ 42, 45},{ 44, 47}, // 28 +{ 44, 47},{ 46, 49},{ 46, 49},{ 48, 51},{ 48, 51},{ 50, 52},{ 53, 43}, // 35 +{ 54, 57},{ 54, 57},{ 56, 59},{ 56, 59},{ 58, 61},{ 58, 61},{ 60, 63}, // 42 +{ 60, 63},{ 62, 65},{ 62, 65},{ 50, 66},{ 67, 55},{ 68, 57},{ 68, 57}, // 49 +{ 70, 73},{ 70, 73},{ 72, 75},{ 72, 75},{ 74, 77},{ 74, 77},{ 76, 79}, // 56 +{ 76, 79},{ 62, 81},{ 62, 81},{ 64, 82},{ 83, 69},{ 84, 71},{ 84, 71}, // 63 +{ 86, 73},{ 86, 73},{ 44, 59},{ 44, 59},{ 58, 61},{ 58, 61},{ 60, 49}, // 70 +{ 60, 49},{ 76, 89},{ 76, 89},{ 78, 91},{ 78, 91},{ 80, 92},{ 93, 69}, // 77 +{ 94, 87},{ 94, 87},{ 96, 45},{ 96, 45},{ 48, 99},{ 48, 99},{ 88,101}, // 84 +{ 88,101},{ 80,102},{103, 69},{104, 87},{104, 87},{106, 57},{106, 57}, // 91 +{ 62,109},{ 62,109},{ 88,111},{ 88,111},{ 80,112},{113, 85},{114, 87}, // 98 +{114, 87},{116, 57},{116, 57},{ 62,119},{ 62,119},{ 88,121},{ 88,121}, // 105 +{ 90,122},{123, 85},{124, 97},{124, 97},{126, 57},{126, 57},{ 62,129}, // 112 +{ 62,129},{ 98,131},{ 98,131},{ 90,132},{133, 85},{134, 97},{134, 97}, // 119 +{136, 57},{136, 57},{ 62,139},{ 62,139},{ 98,141},{ 98,141},{ 90,142}, // 126 +{143, 95},{144, 97},{144, 97},{ 68, 57},{ 68, 57},{ 62, 81},{ 62, 81}, // 133 +{ 98,147},{ 98,147},{100,148},{149, 95},{150,107},{150,107},{108,151}, // 140 +{108,151},{100,152},{153, 95},{154,107},{108,155},{100,156},{157, 95}, // 147 +{158,107},{108,159},{100,160},{161,105},{162,107},{108,163},{110,164}, // 154 +{165,105},{166,117},{118,167},{110,168},{169,105},{170,117},{118,171}, // 161 +{110,172},{173,105},{174,117},{118,175},{110,176},{177,105},{178,117}, // 168 
+{118,179},{110,180},{181,115},{182,117},{118,183},{120,184},{185,115}, // 175 +{186,127},{128,187},{120,188},{189,115},{190,127},{128,191},{120,192}, // 182 +{193,115},{194,127},{128,195},{120,196},{197,115},{198,127},{128,199}, // 189 +{120,200},{201,115},{202,127},{128,203},{120,204},{205,115},{206,127}, // 196 +{128,207},{120,208},{209,125},{210,127},{128,211},{130,212},{213,125}, // 203 +{214,137},{138,215},{130,216},{217,125},{218,137},{138,219},{130,220}, // 210 +{221,125},{222,137},{138,223},{130,224},{225,125},{226,137},{138,227}, // 217 +{130,228},{229,125},{230,137},{138,231},{130,232},{233,125},{234,137}, // 224 +{138,235},{130,236},{237,125},{238,137},{138,239},{130,240},{241,125}, // 231 +{242,137},{138,243},{130,244},{245,135},{246,137},{138,247},{140,248}, // 238 +{249,135},{250, 69},{ 80,251},{140,252},{249,135},{250, 69},{ 80,251}, // 245 +{140,252},{ 0, 0},{ 0, 0},{ 0, 0}}; // 252 +#define nex(state,sel) State_table[state][sel] + +//////////////////////////// StateMap ////////////////////////// + +// A StateMap maps a context to a probability. Methods: +// +// Statemap sm(n) creates a StateMap with n contexts using 4*n bytes memory. +// sm.p(cx, limit) converts state cx (0..n-1) to a probability (0..4095) +// that the next updated bit y=1. +// limit (1..1023, default 255) is the maximum count for computing a +// prediction. Larger values are better for stationary sources. +// sm.update(y) updates the model with actual bit y (0..1). 
+ +class StateMap { +protected: + const int N; // Number of contexts + int cxt; // Context of last prediction + U32 *t; // cxt -> prediction in high 22 bits, count in low 10 bits + static int dt[1024]; // i -> 16K/(i+3) +public: + StateMap(int n=256); + + // update bit y (0..1) + void update(int y, int limit=255) { + assert(cxt>=0 && cxt>10; // count, prediction + if (n>3)*dt[n]&0xfffffc00; + } + + // predict next bit in context cx + int p(int cx) { + assert(cx>=0 && cx>20; + } +}; + +int StateMap::dt[1024]={0}; + +StateMap::StateMap(int n): N(n), cxt(0) { + alloc(t, N); + for (int i=0; i=0 && cx>16)+(x2=p2)*(wt[cxt+1]>>16)+128>>8; + } + void update(int y) { + assert(y==0 || y==1); + int err=((y<<12)-squash(pr)); + if ((wt[cxt]&3)<3) + err*=4-(++wt[cxt]&3); + err=err+8>>4; + wt[cxt]+=x1*err&-4; + wt[cxt+1]+=x2*err; + } +}; + +Mix::Mix(int n): N(n), x1(0), x2(0), cxt(0), pr(0) { + alloc(wt, n*2); + for (int i=0; i h(n) - create using n bytes n and B must be +// powers of 2 with n >= B*4, and B >= 2. +// h[i] returns array [1..B-1] of bytes indexed by i, creating and +// replacing another element if needed. Element 0 is the +// checksum and should not be modified. 
+ +template +class HashTable { + U8* t; // table: 1 element = B bytes: checksum priority data data + const U32 N; // size in bytes +public: + HashTable(int n); + ~HashTable(); + U8* operator[](U32 i); +}; + +template +HashTable::HashTable(int n): t(0), N(n) { + assert(B>=2 && (B&B-1)==0); + assert(N>=B*4 && (N&N-1)==0); + alloc(t, N+B*4+64); + t+=64-int(((long)t)&63); // align on cache line boundary +} + +template +inline U8* HashTable::operator[](U32 i) { + i*=123456791; + i=i<<16|i>>16; + i*=234567891; + int chk=i>>24; + i=i*B&N-B; + if (t[i]==chk) return t+i; + if (t[i^B]==chk) return t+(i^B); + if (t[i^B*2]==chk) return t+(i^B*2); + if (t[i+1]>t[i+1^B] || t[i+1]>t[i+1^B*2]) i^=B; + if (t[i+1]>t[i+1^B^B*2]) i^=B^B*2; + memset(t+i, 0, B); + t[i]=chk; + return t+i; +} + +template +HashTable::~HashTable() { + int c=0, c0=0; + for (U32 i=0; i %1.4f%% full, %1.4f%% utilized of %d KiB\n", + B, 100.0*c0*B/N, 100.0*c/N, N>>10); +} + +////////////////////////// LZP ///////////////////////// + +U32 MEM=1<<29; // Global memory limit, 1 << 22+(memory option) + +// LZP predicts the next byte and maintains context. Methods: +// c() returns the predicted byte for the next update, or -1 if none. +// p() returns the 12 bit probability (0..4095) that c() is next. +// update(ch) updates the model with actual byte ch (0..255). +// c(i) returns the i'th prior byte of context, i > 0. +// c4() returns the order 4 context, shifted into the LSB. +// c8() returns a hash of the order 8 context, shifted 4 bits into LSB. +// word0, word1 are hashes of the current and previous word (a-z). 
+ +class LZP { +private: + const int N, H; // buf, t sizes + enum {MINLEN=12}; // minimum match length + U8* buf; // Rotating buffer of size N + U32* t; // hash table of pointers in high 24 bits, state in low 8 bits + int match; // start of match + int len; // length of match + int pos; // position of next ch to write to buf + U32 h; // context hash + U32 h1; // hash of last 8 byte updates, shifting 4 bits to MSB + U32 h2; // last 4 updates, shifting 8 bits to MSB + StateMap sm1; // len+offset -> p + APM a1, a2, a3; // p, context -> p + int literals, matches; // statistics +public: + U32 word0, word1; // hashes of last 2 words (case insensitive a-z) + LZP(); + ~LZP(); + int c(); // predicted char + int c(int i);// context + int c4() {return h2;} // order 4 context, c(1) in LSB + int c8() {return h1;} // hashed order 8 context + int p(); // probability that next char is c() * 4096 + void update(int ch); // update model with actual char ch +}; + +// Initialize +LZP::LZP(): N(MEM/8), H(MEM/32), + match(-1), len(0), pos(0), h(0), h1(0), h2(0), + sm1(0x200), a1(0x10000), a2(0x40000), a3(0x100000), + literals(0), matches(0), word0(0), word1(0) { + assert(MEM>0); + assert(H>0); + alloc(buf, N); + alloc(t, H); +} + +// Print statistics +LZP::~LZP() { + int c=0; + for (int i=0; i>8, pos>10); + printf("LZP %d literals, %d matches (%1.4f%% matched)\n", + literals, matches, + literals+matches>0?100.0*matches/(literals+matches):0.0); +} + +// Predicted next byte, or -1 for no prediction +inline int LZP::c() { + return len>=MINLEN ? 
buf[match&N-1] : -1; +} + +// Return i'th byte of context (i > 0) +inline int LZP::c(int i) { + assert(i>0); + return buf[pos-i&N-1]; +} + +// Return prediction that c() will be the next byte (0..4095) +int LZP::p() { + if (len28) cxt=28+(len>=32)+(len>=64)+(len>=128); + int pc=c(); + int pr=sm1.p(cxt); + pr=stretch(pr); + pr=a1.pp(2048, pr*2, h2*256+pc&0xffff)*3+pr>>2; + pr=a2.pp(2048, pr*2, h1*(11<<6)+pc&0x3ffff)*3+pr>>2; + pr=a3.pp(2048, pr*2, h1*(7<<4)+pc&0xfffff)*3+pr>>2; + pr=squash(pr); + return pr; +} + +// Update model with predicted byte ch (0..255) +void LZP::update(int ch) { + int y=c()==ch; // 1 if prediction of ch was right, else 0 + h1=h1*(3<<4)+ch+1; // update context hashes + h2=h2<<8|ch; + h=h*(5<<2)+ch+1&H-1; + if (len>=MINLEN) { + sm1.update(y); + a1.update(y); + a2.update(y); + a3.update(y); + } + if (isalpha(ch)) + word0=word0*(29<<2)+tolower(ch); + else if (word0) + word1=word0, word0=0; + buf[pos&N-1]=ch; // update buf + ++pos; + if (y) { // extend match + ++len; + ++match; + ++matches; + } + else { // find new match, try order 6 context first + ++literals; + y=0; + len=1; + match=t[h]; + if (!((match^pos)&N-1)) --match; + while (len<=128 && buf[match-len&N-1]==buf[pos-len&N-1]) ++len; + --len; + } + t[h]=pos; +} + +LZP* lzp=0; + +//////////////////////////// Predictor ///////////////////////// + +// A Predictor estimates the probability that the next bit of +// uncompressed data is 1. Methods: +// Predictor() creates. +// p() returns P(1) as a 12 bit number (0-4095). +// update(y) trains the predictor with the actual bit (0 or 1). 
+ +class Predictor { + enum {N=11}; // number of contexts + int c0; // last 0-7 bits with leading 1, 0 before LZP flag + int nibble; // last 0-3 bits with leading 1 (1..15) + int bcount; // number of bits in c0 (0..7) + HashTable<16> t; // context -> state + StateMap sm[N]; // state -> prediction + U8* cp[N]; // i -> state array of bit histories for i'th context + U8* sp[N]; // i -> pointer to bit history for i'th context + Mix m[N-1]; // combines 2 predictions given a context + APM a1, a2, a3; // adjusts a prediction given a context + U8* t2; // order 1 contexts -> state + +public: + Predictor(); + int p(); + void update(int y); +}; + +// Initialize +Predictor::Predictor(): + c0(0), nibble(1), bcount(0), t(MEM/2), + a1(0x10000), a2(0x10000), a3(0x10000) { + alloc(t2, 0x40000); + for (int i=0; i=0 && bcount<8); + assert(c0>=0 && c0<256); + assert(nibble>=1 && nibble<=15); + if (c0==0) + c0=1-y; + else { + *sp[0]=nex(*sp[0], y); + sm[0].update(y); + for (int i=1; i=16) nibble=1; + a1.update(y); + a2.update(y); + a3.update(y); + } +} + +// Predict next bit +int Predictor::p() { + assert(lzp); + if (c0==0) + return lzp->p(); + else { + + // Set context pointers + int pc=lzp->c(); // mispredicted byte + int r=pc+256>>8-bcount==c0; // c0 consistent with mispredicted byte? + U32 c4=lzp->c4(); // last 4 whole context bytes, shifted into LSB + U32 c8=(lzp->c8()<<4)-1; // hash of last 7 bytes with 4 trailing 1 bits + if ((bcount&3)==0) { // nibble boundary? Update context pointers + pc&=-r; + U32 c4p=c4<<8; + if (bcount==0) { // byte boundary? 
Update order-1 context pointers + cp[0]=t2+(c4>>16&0xff00); + cp[1]=t2+(c4>>8 &0xff00)+0x10000; + cp[2]=t2+(c4 &0xff00)+0x20000; + cp[3]=t2+(c4<<8 &0xff00)+0x30000; + } + cp[4]=t[(c4p&0xffff00)-c0]; + cp[5]=t[(c4p&0xffffff00)*3+c0]; + cp[6]=t[c4*7+c0]; + cp[7]=t[(c8*5&0xfffffc)+c0]; + cp[8]=t[(c8*11&0xffffff0)+c0+pc*13]; + cp[9]=t[lzp->word0*5+c0+pc*17]; + cp[10]=t[lzp->word1*7+lzp->word0*11+c0+pc*37]; + } + + // Mix predictions + r<<=8; + sp[0]=&cp[0][c0]; + int pr=stretch(sm[0].p(*sp[0])); + for (int i=1; i>2; + } + pr=a1.pp(512, pr*2, c0+pc*256&0xffff)*3+pr>>2; // Adjust prediction + pr=a2.pp(512, pr*2, c4<<8&0xff00|c0)*3+pr>>2; + pr=a3.pp(512, pr*2, c4*3+c0&0xffff)*3+pr>>2; + return squash(pr); + } +} + +Predictor* predictor=0; + +/////////////////////////// get4, put4 ////////////////////////// + +// Read/write a 4 byte big-endian number +int get4(FILE* in) { + int r=getc(in); + r=r*256+getc(in); + r=r*256+getc(in); + r=r*256+getc(in); + return r; +} + +void put4(U32 c, FILE* out) { + fprintf(out, "%c%c%c%c", c>>24, c>>16, c>>8, c); +} + +//////////////////////////// Encoder //////////////////////////// + +// An Encoder arithmetic codes in blocks of size BUFSIZE. Methods: +// Encoder(COMPRESS, f) creates encoder for compression to archive f, which +// must be open past any header for writing in binary mode. +// Encoder(DECOMPRESS, f) creates encoder for decompression from archive f, +// which must be open past any header for reading in binary mode. +// code(i) in COMPRESS mode compresses bit i (0 or 1) to file f. +// code() in DECOMPRESS mode returns the next decompressed bit from file f. +// count() should be called after each byte is compressed. +// flush() should be called after compression is done. It is also called +// automatically when a block is written. + +typedef enum {COMPRESS, DECOMPRESS} Mode; +class Encoder { +private: + const Mode mode; // Compress or decompress? 
+ FILE* archive; // Compressed data file + U32 x1, x2; // Range, initially [0, 1), scaled by 2^32 + U32 x; // Decompress mode: last 4 input bytes of archive + enum {BUFSIZE=0x20000}; + static unsigned char* buf; // Compression output buffer, size BUFSIZE + int usize, csize; // Buffered uncompressed and compressed sizes + double usum, csum; // Total of usize, csize + +public: + Encoder(Mode m, FILE* f); + void flush(); // call this when compression is finished + + // Compress bit y or return decompressed bit + int code(int y=0) { + assert(predictor); + int p=predictor->p(); + assert(p>=0 && p<4096); + p+=p<2048; + U32 xmid=x1 + (x2-x1>>12)*p + ((x2-x1&0xfff)*p>>12); + assert(xmid>=x1 && xmidupdate(y); + while (((x1^x2)&0xff000000)==0) { // pass equal leading bytes of range + if (mode==COMPRESS) buf[csize++]=x2>>24; + x1<<=8; + x2=(x2<<8)+255; + if (mode==DECOMPRESS) x=(x<<8)+getc(archive); + } + return y; + } + + // Count one byte + void count() { + assert(mode==COMPRESS); + ++usize; + if (csize>BUFSIZE-256) + flush(); + } +}; +unsigned char* Encoder::buf=0; + +// Create in mode m (COMPRESS or DECOMPRESS) with f opened as the archive. +Encoder::Encoder(Mode m, FILE* f): + mode(m), archive(f), x1(0), x2(0xffffffff), x(0), + usize(0), csize(0), usum(0), csum(0) { + if (mode==DECOMPRESS) { // x = first 4 bytes of archive + for (int i=0; i<4; ++i) + x=(x<<8)+(getc(archive)&255); + csize=4; + } + else if (!buf) + alloc(buf, BUFSIZE); +} + +// Write a compressed block and reinitialize the encoder. 
The format is: +// uncompressed size (usize, 4 byte, MSB first) +// compressed size (csize, 4 bytes, MSB first) +// compressed data (csize bytes) +void Encoder::flush() { + if (mode==COMPRESS) { + buf[csize++]=x1>>24; + buf[csize++]=255; + buf[csize++]=255; + buf[csize++]=255; + putc(0, archive); + putc('c', archive); + put4(usize, archive); + put4(csize, archive); + fwrite(buf, 1, csize, archive); + usum+=usize; + csum+=csize+10; + printf("%15.0f -> %15.0f" + "\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b", + usum, csum); + x1=x=usize=csize=0; + x2=0xffffffff; + } +} + +/////////////////////////// paq9a //////////////////////////////// + +// Compress or decompress from in to out, depending on whether mode +// is COMPRESS or DECOMPRESS. A byte c is encoded as a 1 bit if it +// is predicted by LZP, otherwise a 0 followed by 8 bits from MSB to LSB. +void paq9a(FILE* in, FILE* out, Mode mode) { + if (!lzp && !predictor) { + lzp=new LZP; + predictor=new Predictor; + printf("%8d KiB\b\b\b\b\b\b\b\b\b\b\b\b", allocated>>10); + } + if (mode==COMPRESS) { + Encoder e(COMPRESS, out); + int c; + while ((c=getc(in))!=EOF) { + int cp=lzp->c(); + if (c==cp) + e.code(1); + else + for (int i=8; i>=0; --i) + e.code(c>>i&1); + e.count(); + lzp->update(c); + } + e.flush(); + } + else { // DECOMPRESS + int usize=get4(in); + get4(in); // csize + Encoder e(DECOMPRESS, in); + while (usize--) { + int c=lzp->c(); + if (e.code()==0) { + c=1; + while (c<256) c+=c+e.code(); + c&=255; + } + if (out) putc(c, out); + lzp->update(c); + } + } +} + + +///////////////////////////// store /////////////////////////// + +// Store a file in blocks as: {'\0' mode usize csize contents}... 
+void store(FILE* in, FILE* out) { + assert(in); + assert(out); + + // Store in blocks + const int BLOCKSIZE=0x100000; + static char* buf=0; + if (!buf) alloc(buf, BLOCKSIZE); + bool first=true; + while (true) { + int n=fread(buf, 1, BLOCKSIZE, in); + if (!first && n<=0) break; + fprintf(out, "%c%c", 0, 's'); + put4(n, out); // usize + put4(n, out); // csize + fwrite(buf, 1, n, out); + first=false; + } + + // Close file + fclose(in); +} + +// Write usize == csize bytes of an uncompressed block from in to out +void unstore(FILE* in, FILE* out) { + assert(in); + int usize=get4(in); + int csize=get4(in); + if (usize!=csize) + printf("Bad archive format: usize=%d csize=%d\n", usize, csize); + static char* buf=0; + const int BUFSIZE=0x1000; + if (!buf) alloc(buf, BUFSIZE); + while (csize>0) { + usize=csize; + if (usize>BUFSIZE) usize=BUFSIZE; + if (int(fread(buf, 1, usize, in))!=usize) + printf("Unexpected end of archive\n"), exit(1); + if (out) fwrite(buf, 1, usize, out); + csize-=usize; + } +} + +//////////////////////// Archiving functions //////////////////////// + +const int MAXNAMELEN=1023; // max filename length + +// Return true if the first 4 bytes of in are a valid archive +bool check_archive(FILE* in) { + return getc(in)=='p' && getc(in)=='Q' && getc(in)=='9' && getc(in)==1; +} + +// Open archive and check for valid archive header, exit if bad. +// Set MEM to memory option '1' through '9' +FILE* open_archive(const char* filename) { + FILE* in=fopen(filename, "rb"); + if (!in) + printf("Cannot find archive %s\n", filename), exit(1); + if (!check_archive(in) || (MEM=getc(in))<'1' || MEM>'9') { + fclose(in); + printf("%s: Not a paq9a archive\n", filename); + exit(1); + } + return in; +} + +// Compress filename to out. option is 'c' to compress or 's' to store. 
+void compress(const char* filename, FILE* out, int option) { + + // Open input file + FILE* in=fopen(filename, "rb"); + if (!in) { + printf("File not found: %s\n", filename); + return; + } + fprintf(out, "%s", filename); + printf("%-40s ", filename); + + // Compress depending on option + if (option=='s') + store(in, out); + else if (option=='c') + paq9a(in, out, COMPRESS); + printf("\n"); +} + +// List archive contents +void list(const char* archive) { + double usum=0, csum=0; // uncompressed and compressed size per file + double utotal=0, ctotal=4; // total size in archive + static char filename[MAXNAMELEN+1]; + int mode=0; + + FILE* in=open_archive(archive); + printf("\npaq9a -%c\n", MEM); + while (true) { + + // Get filename, mode + int c=getc(in); + if (c==EOF) break; + if (c) { // start of new file? Print previous file + if (mode) + printf("%10.0f -> %10.0f %c %s\n", usum, csum, mode, filename); + int len=0; + filename[len++]=c; + while ((c=getc(in))!=EOF && c) + if (lenBUFSIZE) + csize-=fread(buf, 1, BUFSIZE, in); + fread(buf, 1, csize, in); + } + printf("%10.0f -> %10.0f %c %s\n", usum, csum, mode, filename); + utotal+=usum; + ctotal+=csum; + printf("%10.0f -> %10.0f total\n", utotal, ctotal); + fclose(in); +} + +// Extract files given command line arguments +// Input format is: [filename {'\0' mode usize csize contents}...]... 
+void extract(int argc, char** argv) { + assert(argc>2); + assert(argv[1][0]=='x'); + static char filename[MAXNAMELEN+1]; // filename from archive + + // Open archive + FILE* in=open_archive(argv[2]); + MEM=1<<22+MEM-'0'; + + // Extract files + argc-=3; + argv+=3; + FILE* out=0; + while (true) { // for each block + + // Get filename + int c; + for (int i=0;; ++i) { + c=getc(in); + if (c==EOF) break; + if (i0) fn=argv[0], --argc, ++argv; + if (out) fclose(out); + out=fopen(fn, "rb"); + if (out) { + printf("\nCannot overwrite file, skipping: %s ", fn); + fclose(out); + out=0; + } + else { + out=fopen(fn, "wb"); + if (!out) printf("\nCannot create file: %s ", fn); + } + if (out) { + if (fn==filename) printf("\n%s ", filename); + else printf("\n%s -> %s ", filename, fn); + } + } + + // Extract block + int mode=getc(in); + if (mode=='s') + unstore(in, out); + else if (mode=='c') + paq9a(in, out, DECOMPRESS); + else + printf("\nUnsupported compression mode %c %d at %ld\n", + mode, mode, ftell(in)), exit(1); + } + printf("\n"); + if (out) fclose(out); +} + +// Command line is: paq9a {a|x|l} archive [[-option] files...]... 
+int main(int argc, char** argv) { + clock_t start=clock(); + + // Check command line arguments + if (argc<3 || argv[1][1] || (argv[1][0]!='a' && argv[1][0]!='x' + && argv[1][0]!='l') || (argv[1][0]=='a' && argc<4) || argv[2][0]=='-') + { + printf("paq9a archiver (C) 2007, Matt Mahoney\n" + "Free software under GPL, http://www.gnu.org/copyleft/gpl.html\n" + "\n" + "To create archive: paq9a a archive [-1..-9] [[-s|-c] files...]...\n" + " -1..-9 = use 18 to 1585 MiB memory (default -7 = 408 MiB)\n" + " -s = store, -c = compress (default)\n" + "To extract files: paq9a x archive [files...]\n" + "To list contents: paq9a l archive\n"); + exit(1); + } + + // Create archive + if (argv[1][0]=='a') { + int option = 'c'; // -c or -s + FILE* out=fopen(argv[2], "rb"); + if (out) printf("Cannot overwrite archive %s\n", argv[2]), exit(1); + out=fopen(argv[2], "wb"); + if (!out) printf("Cannot create archive %s\n", argv[2]), exit(1); + fprintf(out, "pQ9%c", 1); + int i=3; + if (argc>3 && argv[3][0]=='-' && argv[3][1]>='1' && argv[3][1]<='9' + && argv[3][2]==0) { + putc(argv[3][1], out); + MEM=1<<22+argv[3][1]-'0'; + ++i; + } + else + putc('7', out); + for (; i %ld in %1.2f sec\n", ftell(out), + double(clock()-start)/CLOCKS_PER_SEC); + } + + // List archive contents + else if (argv[1][0]=='l') + list(argv[2]); + + // Extract from archive + else if (argv[1][0]=='x') { + extract(argc, argv); + printf("%1.2f sec\n", double(clock()-start)/CLOCKS_PER_SEC); + } + + // Report statistics + delete predictor; + delete lzp; + printf("Used %d KiB memory\n", allocated>>10); + return 0; +} diff --git a/paq9a.exe b/paq9a.exe new file mode 100755 index 0000000..ee938e2 Binary files /dev/null and b/paq9a.exe differ diff --git a/paqtest.py b/paqtest.py new file mode 100755 index 0000000..4842707 --- /dev/null +++ b/paqtest.py @@ -0,0 +1,178 @@ +import os +import argparse +import subprocess +from multiprocessing import Pool +from sys import platform + + +def set_archive_filename(output: str, 
paq8l_version: str) -> str: + basename, ext = os.path.splitext(output) + if ext == 'paq8l{}'.format(paq8l_version): + return output + if ext == 'paq8l': + return output + paq8l_version + else: + return output + '.paq8l' + paq8l_version + + +def compress_file(file: str, output: str, exe_filename: str, compression_arg: str, paq8l_version: str) -> None: + output = set_archive_filename(output, paq8l_version) + if platform == "win32": + cmd = [exe_filename, compression_arg, file, output] + else: + cmd = "{} {} \"{}\" \"{}\"".format(exe_filename, compression_arg, file, output) + print(cmd) + subprocess.run(cmd, shell=True) + + +def test_archive(input_location: str, archive: str, exe_filename: str, paq8l_version: str) -> None: + archive = set_archive_filename(archive, paq8l_version) + if platform == "win32": + cmd = [exe_filename, '-t', archive] + else: + cmd = "{} -t \"{}\" \"{}\"".format(exe_filename, archive, input_location) + print(cmd) + subprocess.run(cmd, shell=True) + + +def create_text_file(filelist: list, input_location: str, filename: str) -> str: + if filelist: + filelist_path = os.path.join(input_location, filename + '.txt') + print("Writing filelist.txt") + txt_file = open(filelist_path, 'w') + txt_file.write('\n') + for file in filelist: + if not os.path.isdir(file): + txt_file.write(file + '\n') + txt_file.close() + return '@' + filelist_path + else: + return input_location + +def compression_args(args: argparse) -> str: + if not args.level: + level = '9' + else: + level = args.level + + +def get_output_location(args: argparse) -> str: + if not args.output: + output_location = args.input + else: + output_location = args.output + return output_location + +def parse_action(args: argparse) -> tuple: + action = "compress" + action_finished = "Compression" + if args.test and not args.test_only: + action += " and test" + action_finished += " and testing" + if args.test_only: + action = "test" + action_finished = "Testing" + return action, action_finished + + 
+def single_threaded_compression(args: argparse, input_location: str, output_location: str, filename: str, + exe_filename: str, paq8l_version: str, compression_args: str) -> None: + filelist = [] + action, _ = parse_action(args) + if os.path.isdir(input_location): + print("Listing files to {}".format(action)) + for dir_, _, files in os.walk(input_location): + for fileName in sorted(files): + rel_file = os.path.join(fileName) + filelist.append(rel_file) + print(rel_file) + single_file = False + else: + print("file to {}".format(action), filename) + single_file = True + + if (filelist or single_file) and not args.test_only: + filename = create_text_file(filelist, input_location, filename) + print("\nStarting compression...\n") + compress_file(filename, output_location, exe_filename, compression_args) + if args.test or args.test_only: + print("\nVerifying archive...\n") + test_archive(input_location, output_location, exe_filename) + + +def multithreaded_compression(args: argparse, input_location: str, output_location: str, filename: str, + exe_filename: str, compression_args: str) -> None: + if os.path.isdir(input_location): + print("Compressing each file separately") + pool = Pool() + for file in sorted(os.listdir(input_location)): + file_path = os.path.join(input_location, file) + pool.apply_async(compress_file, (file_path, file_path, exe_filename, compression_args)) + pool.close() + pool.join() + else: + print("file to compress:", filename) + print("\nStarting compression...\n") + compress_file(input_location, output_location, exe_filename, compression_args) + if args.test or args.test_only: + print("\nVerifying archive is not yet implemented for multi-threaded individual file compression...\n") + + +def main() -> None: + parser = argparse.ArgumentParser(description='This script will generate a filelist file which will be used by ' + 'paq8l_v207 for compressing. 
It is also used for testing if you ' + 'use the -t or -to argument') + required = parser.add_argument_group('required arguments') + optional = parser.add_argument_group('optional arguments') + required.add_argument('-i', '--input', help="Input file or folder to compress. REQUIRED", required=True) + optional.add_argument('-v', '--version', help='Version of paq8l to use. Example: 207. Default is 207', + required=False, default='207') + optional.add_argument('-l', '--level', help="Compression level and switches. Example: 9a to compress using level 9 " + "and with the 'Adaptive learning rate' switch. Default is 9", + required=False, default='9') + optional.add_argument('-o', '--output', help="Output file to use. If not used, the archive will be saved at the " + "root of the parent folder where the file/folder to compress is " + "located. Do not provide extension", required=False, default=None) + optional.add_argument('-t', '--test', help="Optional flag to test the archive after compressing it. It is " + "recommended to use this option. Default is not to test", + required=False, action='store_true') + optional.add_argument('-to', '--test-only', help="Skip compression and just test the archive.", + required=False, action='store_true') + optional.add_argument('-r', '--remove', help="Deletes the filelist text file. Not recommended unless you plan not " + "to test the archive later. Default is not to remove", required=False, + default=False, action='store_true') + optional.add_argument('-mt', '--multithread', help="Compresses each file on a separate thread. This creates " + "individual archives with just one file", required=False, + default=False, action='store_true') + optional.add_argument('-n', '--nativecpu', help="Use the native CPU version. 
" + "These versions usually ends with _nativecpu and may provide " + "performane improvements on your machine over the generic version", + required=False, + default=False, action='store_true') + args = parser.parse_args() + + # Variables: + exe_filename = "/home/stan/Documents/Dev/Fbroswer/paq8l" + compression_args = '-' + args.level + input_location = args.input + output_location = get_output_location(args) + filename = os.path.basename(input_location) + + # Compression + if not args.multithread: + single_threaded_compression(args, input_location, output_location, filename, + exe_filename, compression_args) + else: + multithreaded_compression(args, input_location, output_location, filename, + exe_filename) + + # Remove file list if not in multithreaded mode. + if args.remove and not args.multithread: + print("\nRemoving the filelist file") + os.remove(os.path.join(input_location, filename + '.txt')) + _, action_finished = parse_action(args) + print("\n{} finished!".format(action_finished)) + + +if __name__ == "__main__": + main() \ No newline at end of file diff --git a/pypaqtest.paq b/pypaqtest.paq new file mode 100755 index 0000000..e69de29 diff --git a/readme.txt b/readme.txt new file mode 100755 index 0000000..147e927 --- /dev/null +++ b/readme.txt @@ -0,0 +1,43 @@ +paq8l is an open source (GPL) file compressor and archiver. +Last update Mar. 18, 2007 by Matt Mahoney. + +Contents of paq8l.zip: + +readme.txt - this file +paq8l.exe - Win32 (MinGW g++) executable for Pentium MMX and higher +paq-8l_intel.exe - Faster Win32 executable (compiled by Johan de Bock with Intel C++ from http://uclc.info ) +paq8l - Linux executable (by Giorgio Tani, Mar. 18, 2007) + +paq8l.cpp - C++ source code for all versions (Mar. 
8, 2007) +paq7asm.asm - NASM/YASM assembler code for Pentium MMX or higher +paq7asmsse.asm - NASM/YASM for Pentium 4 (SSE2) or higher in 32 bit mode +paq7asm-x86_64.asm - YASM for x86-64 bit processors (tested in 64 bit Linux) + +paq8l can be compiled for other processors without the assembler +code using the -DNOASM option (but it will run slower). +The assembler code is the same for all paq7/8 versions. + +paq8l was written by Matt Mahoney (as paq8f) with improvements by +Bill Pettis (based on improvements by Alexander Ratushnyak and +Przemyslaw Skibinski in the paq8hp* series) and Serge Osnach (additional +models), and Andrew Paterson (Borland port). The assembler code was ported +to 64 bit by Matthew Fite and 32 bit SSE2 by wowtiger. + +Other contributors to the PAQ project: Berto Destasio (tuning earlier +models for better compression), Johan de Bock (benchmarking, compiling +fast executables), David A. Scott (arithmetic coder improvements), +Fabio Buffoni (speed optimizations), Jason Schmidt (compression +improvements), Rudi Cilibrasi (text modeling), Pavel L. Holoborodko +(PGM image modeling), and Jari Aalto (licensing/distribution). + +This work would not be possible without the benchmarking efforts of +Marcus Hutter (Hutter prize), Werner Bergmans (maximumcompression.com), +Johan de Bock (UCLC), Berto Destasio (Emilcont benchmark), Stephan Busch +(Squeeze Chart), Leonid A. Broukhis (Calgary Corpus Challenge), +and Black Fox. + +A similar (but rewritten) context mixing algorithm is used in +WinRK 3.0.3 (pwcm mode) by Malcolm Taylor. Modified versions of +PAQ (faster but less compression) are used in UDA and WinUDA by dwing, +and in xml-wrt by Przemyslaw Skibinski. 
+ diff --git a/requirements.txt b/requirements.txt new file mode 100755 index 0000000..2b5abb2 --- /dev/null +++ b/requirements.txt @@ -0,0 +1,6 @@ +# To ensure app dependencies are ported from your virtual environment/host machine into your container, run 'pip freeze > requirements.txt' in the terminal to overwrite this file +py7zr +rarfile +tqdm +PyQt5 +mutagen diff --git a/stanzip.py b/stanzip.py new file mode 100755 index 0000000..fcdd5e9 --- /dev/null +++ b/stanzip.py @@ -0,0 +1,152 @@ +# stanzip.py +# Description: Can Compress and Extract files using Various libraries more compression methods are going to be added +# +import os +import zipfile +import rarfile +import py7zr +import shutil +import argparse +import tqdm + +from concurrent.futures import ThreadPoolExecutor + +# File Extractor +class Extractor: + + def zipviewer(self, source, destination): + + if not os.path.exists(source): + print(f"Error: Archive file not found: {source}") + return + + try: + pbar = tqdm.tqdm(total=100, desc="Extracting Archive file") + + if not os.path.exists(destination): + os.makedirs(destination) + pbar.update(1) + + if source.endswith(".zip"): + with zipfile.ZipFile(source, 'r') as zip_ref: + with tqdm.tqdm(total=len(zipfile.ZipFile(source).namelist()), desc="Extracting ZIP files") as pbar: + for filename in zip_ref.namelist(): + zip_ref.extract(filename, destination) + pbar.update(1) + print(f"Extracted all files from {source} to {destination}") + + elif source.endswith(".rar, .tar.gz, .tar.bz2, .tar.xz, .tar.zst"): + with rarfile.RarFile(source, 'r') as rar_ref: + with tqdm.tqdm(total=len(rar_ref.namelist()), desc="Extracting RAR files") as pbar: + for filename in rar_ref.namelist(): + rar_ref.extractall(filename, destination) + pbar.update(1) + print(f"Extracted all files from {source} to {destination}") + + elif source.endswith(".7z"): + with py7zr.SevenZipFile(source, 'r') as sevenzip_ref: + with tqdm.tqdm(total=len(sevenzip_ref.namelist()), desc="Extracting 7z files") 
as pbar: + for filename in sevenzip_ref.namelist(): + sevenzip_ref.extractall(filename, destination) + pbar.update(1) + print(f"Extracted all files from {source} to {destination}") + + else: + print(f"Unsupported file format: {source}") + + except (zipfile.BadZipFile, zipfile.LargeZipFile) as e: + print(f"ZIP Extraction Error: {e}") + except (rarfile.RarFileException, rarfile.NotRARFile) as e: + print(f"RAR Extraction Error: {e}") + except py7zr.exceptions.SevenZipException as e: + print(f"7z Extraction Error: {e}") + except OSError as e: + print(f"Extraction Error: {e}") + +# File Compressor +class Compressor: + def __init__(self): + pass + + def _compress_folder(self, source_path, zip_file): + for root, _, files in os.walk(source_path): + for file in files: + file_path = os.path.join(root, file) + archive_path = os.path.relpath(file_path, source_path) + self._compress_file(file_path, zip_file, archive_path) + + def _compress_file(self, file_path, zip_file, archive_path=None): + if not archive_path: + archive_path = os.path.basename(file_path) + + if archive_path.endswith(".zip"): + return + + with open(file_path, 'rb') as file: + for chunk in iter(lambda: file.read(1024 * 1024), b''): + zip_file.writestr(archive_path, chunk) + + def compress(self, source_path, archive_name, archive_format="zip"): + + if archive_format != "zip": + raise ValueError(f"Unsupported archive format: {archive_format}") + + archive_path = os.path.join(os.path.dirname(source_path), f"{archive_name}.{archive_format}") + + # Check if source path exists + if not os.path.exists(source_path): + print(f"Source path does not exist: {source_path}") + return + + # Compress the source path + with zipfile.ZipFile(archive_path, 'w', zipfile.ZIP_DEFLATED) as zip_file: + if os.path.isdir(source_path): + print(f"Compressing folder: {source_path}") + self._compress_folder(source_path, zip_file) + else: + print(f"Compressing file: {source_path}") + self._compress_file(source_path, zip_file) + + 
print(f"Compressed to: {archive_path}") + + if os.path.isdir(source_path): + file_list = [] + for root, _, files in os.walk(source_path): + for file in files: + file_path = os.path.join(root, file) + file_list.append(file_path) + + # Use thread pool + with ThreadPoolExecutor(max_workers=4) as executor: + for file_path in file_list: + executor.submit(self._compress_file, file_path, zip_file) + executor.shutdown(wait=True) + +def main(): + parser = argparse.ArgumentParser(description="Compress or extract files") + subparsers = parser.add_subparsers(title="Command", dest="command") + + # Subparser for extraction + extract_parser = subparsers.add_parser("extract") + extract_parser.add_argument("source", help="Path to the archive file") + extract_parser.add_argument("destination", help="Extraction directory") + + # Subparser for compression + compress_parser = subparsers.add_parser("compress") + compress_parser.add_argument("source", help="Path to the file or folder to compress") + compress_parser.add_argument("archive_name", help="Name for the compressed archive") + compress_parser.add_argument("-f", "--format", choices=["zip"], default="zip", help="Archive format (default: zip)") + + args = parser.parse_args() + + if args.command == "extract": + extractor = Extractor() + extractor.zipviewer(args.source, args.destination) + if args.command == "compress": + compressor = Compressor() + compressor.compress(args.source, args.archive_name, args.format) + else: + print("Invalid command. 
Use 'extract' or 'compress'") + +if __name__ == "__main__": + main() \ No newline at end of file diff --git a/test.png b/test.png new file mode 100755 index 0000000..8722e47 Binary files /dev/null and b/test.png differ diff --git a/test.py b/test.py new file mode 100755 index 0000000..7eebc37 --- /dev/null +++ b/test.py @@ -0,0 +1,168 @@ +import pretty_midi +import random +import tkinter as tk +from tkinter import ttk, filedialog +import pygame +import pypianoroll # type: ignore +from icecream import ic # type: ignore + +class midgen: + + def __init__(self, status_label: ttk.Label): + self.status_label = status_label + self.scales = self.scales() + + def scales(self): + scales = { + "Major": [0, 2, 4, 5, 7, 9, 11], + "Minor": [0, 2, 3, 5, 7, 8, 10], + "Pentatonic": [0, 2, 4, 7, 9], + "Blues": [0, 3, 5, 6, 7, 10], + "Whole Tone": [0, 2, 4, 6, 8, 10], + "Chromatic": [i for i in range(12)], + "Octatonic": [0, 1, 3, 4, 6, 7, 9, 10], + "Harmonic Minor": [0, 2, 3, 5, 7, 8, 11], + "Melodic Minor": [0, 2, 3, 5, 7, 9, 11], + "Dorian": [0, 2, 3, 5, 7, 9, 10], + "Phrygian": [0, 1, 3, 5, 7, 8, 10], + "Lydian": [0, 2, 4, 6, 7, 9, 11], + "Mixolydian": [0, 2, 4, 5, 7, 9, 10], + "Locrian": [0, 1, 3, 5, 6, 8, 10], + "Diminished": [0, 2, 3, 5, 6, 8, 9, 11], + "Whole Half Diminished": [0, 2, 3, 5, 6, 8, 9, 11], + "Arabian": [0, 2, 4, 5, 6, 8, 10], + "Hungarian Minor": [0, 2, 3, 6, 7, 8, 11], + "Enigmatic": [0, 1, 4, 6, 8, 10, 11], + "Neapolitan Major": [0, 1, 3, 5, 7, 9, 11], + "Neapolitan Minor": [0, 1, 3, 5, 7, 8, 11], + "Bluesy": [0, 3, 5, 6, 7, 10], + "Hawaiian": [0, 2, 3, 7, 9], + "Japanese": [0, 1, 5, 7, 8], + "Chinese": [0, 4, 6, 7, 11], + "Gypsy": [0, 2, 3, 6, 7, 8, 10], + "Hirojoshi": [0, 2, 3, 7, 8], + "In Sen": [0, 1, 5, 7, 10], + "Iwato": [0, 1, 5, 6, 10], + "Kumoi": [0, 2, 3, 7, 9], + "Pelog": [0, 1, 3, 7, 8], + "Ryukyu": [0, 4, 5, 7, 11], + "Spanish": [0, 1, 3, 4, 5, 6, 8, 10], + "Todi": [0, 1, 3, 6, 7, 8, 11], + "Yo": [0, 2, 5, 7, 9] + } + return scales + + + def 
generate_midi(self): + self.status_label.config(text='Generating MIDI...') + + try: + midi = pretty_midi.PrettyMIDI() + instrument = pretty_midi.Instrument(0) + + scale = random.choice(list(self.scales.keys())) + scale_notes = self.scales[scale] + ic(f"Using scale: {scale}") + ic(f"Using notes: {scale_notes}") + + for start, end in zip(range(0, 100, 10), range(10, 110, 10)): + note = pretty_midi.Note( + velocity=100, pitch=random.choice(scale_notes), + start=start, end=end + ) + instrument.notes.append(note) + + midi.instruments.append(instrument) + + filepath = filedialog.asksaveasfilename(defaultextension='.mid') + if filepath: + midi.write(filepath) + track = pypianoroll.Multitrack(filepath) + track.plot() + self.status_label.config(text='MIDI generated successfully!') + + except Exception as e: + self.status_label.config(text=f"Error generating MIDI: {e}") + +class MidPlay: + """A class to handle MIDI file playback.""" + + def __init__(self): + self.playlist = [] + self.current_midi = None + self.playing = False + pygame.mixer.init() + + def load_midi(self, filepath: str) -> None: + try: + self.current_midi = pretty_midi.PrettyMIDI(filepath) + pygame.mixer.music.load(filepath) + except Exception as e: + print(f"Error loading MIDI: {e}") + + def add_to_playlist(self, filepath: str) -> None: + """Adds a MIDI file to the playlist. + + Args: + filepath: The path to the MIDI file. 
+ """ + self.playlist.append(filepath) + + def clear_playlist(self) -> None: + """Clears the playlist.""" + self.playlist = [] + + def play_midi(self) -> None: + """Starts or resumes playback of the current MIDI file.""" + if self.current_midi: + self.current_midi.instruments[0].synthesize() + pygame.mixer.music.play() + self.playing = True + else: + print("No MIDI file loaded") + + def pause(self) -> None: + """Pauses playback.""" + pygame.mixer.music.pause() + self.playing = False + + def stop(self) -> None: + """Stops playback.""" + pygame.mixer.music.stop() + self.playing = False + +class UserInterface: + def __init__(self): + self.root = tk.Tk() + self.root.title("MIDI Generator") + self.root.geometry("400x200") + self.root.resizable(True, True) + self.status_label = ttk.Label(self.root, text="") + self.status_label.pack() + + self.midi_generator = midgen(self.status_label) + self.midi_player = MidPlay() + + + self.filepath = None + self.midi = None + + + self.generate_button = ttk.Button(self.root, text="Generate MIDI", command=self.midi_generator.generate_midi) + self.generate_button.pack() + + self.load_button = ttk.Button(self.root, text="Load MIDI", command=lambda: self.midi_player.load_midi(self.filepath)) + self.load_button.pack() + + self.play_button = ttk.Button(self.root, text="Play MIDI", command=lambda: self.midi_player.play_midi()) + self.play_button.pack() + + self.exit_button = ttk.Button(self.root, text="Exit", command=self.root.quit) + self.exit_button.pack() + + window = tk.Tk() + window.title("MIDI Generator") + self.root.mainloop() + +if __name__ == "__main__": + ui = UserInterface() diff --git a/test_Fbrowser.py b/test_Fbrowser.py new file mode 100755 index 0000000..75f7a78 --- /dev/null +++ b/test_Fbrowser.py @@ -0,0 +1,37 @@ +import unittest +from unittest.mock import MagicMock +from PyQt5.QtWidgets import QApplication +from fbrowser import SampleMusicBrowser + +class TestSampleMusicBrowser(unittest.TestCase): + def setUp(self): + 
self.app = QApplication([]) + self.browser = SampleMusicBrowser() + + def tearDown(self): + self.app.quit() + + def test_player_error(self): + # Mock QMediaPlayer and set error code + self.browser.player.error = MagicMock(return_value=1) + self.browser.player.errorString = MagicMock(return_value="Test Error") + self.browser.player_error(1) + # Assert that the error message is printed + self.assertIn("An error occurred: Code:1 Test Error", self.browser.console_output) + + def test_player_media_status_changed(self): + # Mock QMediaPlayer and set media status + self.browser.player_media_status_changed(2) + # Assert that the media status is printed + self.assertIn("Media Status: 2", self.browser.console_output) + + def test_play_file(self): + # Mock QFileSystemModel and set file path + self.browser.list_model.filePath = MagicMock(return_value="/path/to/file.mp3") + # Call play_file method + self.browser.play_file(None) + # Assert that the player is playing the correct media + self.assertEqual(self.browser.playlist.media(0).canonicalUrl().toString(), "file:///path/to/file.mp3") + +if __name__ == '__main__': + unittest.main() \ No newline at end of file diff --git a/testmidi.py b/testmidi.py new file mode 100644 index 0000000..7365413 --- /dev/null +++ b/testmidi.py @@ -0,0 +1,159 @@ +import sys +import os +from PyQt5.QtWidgets import QApplication, QMainWindow, QWidget, QVBoxLayout, QHBoxLayout, QPushButton, QFileSystemModel, QTreeView, QLabel, QComboBox +from PyQt5.QtCore import QDir, Qt, QThread, pyqtSignal +import mido +import pygame +import numpy as np +from mingus.midi import fluidsynth +from mingus.containers import Note +import soundfile as sf + +class MidiPlayerThread(QThread): + update_signal = pyqtSignal(str) + + def __init__(self, file_path): + super().__init__() + self.file_path = file_path + self.playing = True + + def run(self): + midi_file = mido.MidiFile(self.file_path) + for msg in midi_file.play(): + if not self.playing: + break + if not msg.is_meta: + 
if msg.type == 'note_on': + fluidsynth.play_Note(Note(msg.note), msg.channel, msg.velocity) + elif msg.type == 'note_off': + fluidsynth.stop_Note(Note(msg.note), msg.channel) + elif msg.type == 'control_change': + fluidsynth.control_change(msg.channel, msg.control, msg.value) + self.update_signal.emit(f"Playing: {msg}") + + def stop(self): + self.playing = False + +class AudioPlayerThread(QThread): + update_signal = pyqtSignal(str) + + def __init__(self, file_path): + super().__init__() + self.file_path = file_path + self.playing = True + + def run(self): + pygame.mixer.music.load(self.file_path) + pygame.mixer.music.play() + while pygame.mixer.music.get_busy() and self.playing: + pygame.time.Clock().tick(10) + self.update_signal.emit(f"Playing audio: {pygame.mixer.music.get_pos() / 1000:.2f} seconds") + + def stop(self): + self.playing = False + pygame.mixer.music.stop() + +class MidiPlayer(QMainWindow): + def __init__(self): + super().__init__() + self.setWindowTitle("MIDI Player and Audio File Browser") + self.setGeometry(100, 100, 800, 600) + + self.central_widget = QWidget() + self.setCentralWidget(self.central_widget) + self.layout = QHBoxLayout(self.central_widget) + + # File Browser + self.model = QFileSystemModel() + self.model.setRootPath(QDir.rootPath()) + self.model.setNameFilters(["*.mid", "*.midi", "*.mp3", "*.wav", "*.sf2"]) + self.model.setNameFilterDisables(False) + + self.tree = QTreeView() + self.tree.setModel(self.model) + self.tree.setRootIndex(self.model.index(QDir.homePath())) + self.tree.setColumnWidth(0, 250) + self.tree.setAnimated(False) + self.tree.setIndentation(20) + self.tree.setSortingEnabled(True) + self.tree.setWindowTitle("File Browser") + self.tree.clicked.connect(self.on_file_clicked) + + # Player controls + self.player_widget = QWidget() + self.player_layout = QVBoxLayout(self.player_widget) + + self.file_label = QLabel("No file selected") + self.player_layout.addWidget(self.file_label) + + self.play_button = 
QPushButton("Play") + self.play_button.clicked.connect(self.play_file) + self.player_layout.addWidget(self.play_button) + + self.stop_button = QPushButton("Stop") + self.stop_button.clicked.connect(self.stop_file) + self.player_layout.addWidget(self.stop_button) + + self.soundfont_combo = QComboBox() + self.soundfont_combo.currentIndexChanged.connect(self.change_soundfont) + self.player_layout.addWidget(self.soundfont_combo) + + self.status_label = QLabel("") + self.player_layout.addWidget(self.status_label) + + # Add widgets to main layout + self.layout.addWidget(self.tree) + self.layout.addWidget(self.player_widget) + + # Initialize pygame mixer + pygame.mixer.init(frequency=44100, size=-16, channels=2, buffer=1024) + + # Initialize FluidSynth + fluidsynth.init(sf2="/path/to/default/soundfont.sf2") # Adjust this path as needed + + self.player_thread = None + + def on_file_clicked(self, index): + file_path = self.model.filePath(index) + self.file_label.setText(os.path.basename(file_path)) + if file_path.lower().endswith('.sf2'): + self.load_soundfont(file_path) + + def load_soundfont(self, sf2_path): + try: + fluidsynth.init(sf2=sf2_path) + self.soundfont_combo.clear() + self.soundfont_combo.addItems([f"Instrument {i}" for i in range(128)]) # MIDI has 128 standard instruments + except Exception as e: + print(f"Error loading soundfont: {e}") + + def change_soundfont(self, index): + fluidsynth.set_instrument(0, index) # Set instrument for channel 0 + + def play_file(self): + file_path = self.model.filePath(self.tree.currentIndex()) + if file_path.lower().endswith(('.mid', '.midi')): + self.player_thread = MidiPlayerThread(file_path) + elif file_path.lower().endswith(('.mp3', '.wav')): + self.player_thread = AudioPlayerThread(file_path) + else: + return + + self.player_thread.update_signal.connect(self.update_status) + self.player_thread.start() + + def stop_file(self): + if self.player_thread and self.player_thread.isRunning(): + self.player_thread.stop() + 
self.player_thread.wait() + pygame.mixer.stop() + fluidsynth.stop_everything() + + def update_status(self, status): + self.status_label.setText(status) + +if __name__ == "__main__": + app = QApplication(sys.argv) + player = MidiPlayer() + player.show() + sys.exit(app.exec_()) \ No newline at end of file