Module `xingyun.savecode.get_code`

Expand source code

from pathlib import Path
import re
from typing import Callable
import fnmatch
from pathlib import PurePosixPath
from xingyun.universal.convert import convert_size_to_bytes

def filter_default(filename: str):
    '''Tell whether a file name ends with one of the saved extensions.

    Returns True when `filename` ends with ".py", ".sh" or ".txt",
    and False otherwise.
    '''

    suffix_match = re.search(r"((\.py)|(\.sh)|(\.txt))$", filename)
    if suffix_match is None:
        return False
    return True


def filter_hidden(filename: str):
    '''Reject paths that contain a dot-prefixed (hidden) component.

    A component counts as hidden when a "." sits at the very start of the
    path or right after a "/" and is followed by any character other than
    a backslash. Returns True when no such component is found.
    '''

    hidden_component = re.search(r"(/|^)\.[^\\]", filename)
    return not hidden_component

def filter_gitignore(filename: str):
    '''Filter out files that are matched by the .gitignore in the working directory.

    ### Parameters
        - filename: "/"-separated relative path of the file to check.

    ### Returns
        False when an entry of ".gitignore" (looked up in the current working
        directory) matches `filename`, True otherwise, including when no
        ".gitignore" exists.

    Entries are matched with `fnmatch`. Blank lines and "#" comment lines are
    skipped and trailing whitespace is dropped. An entry ending with "/" is
    treated as a directory: without any other "/" it matches a directory of
    that name at any depth, otherwise it is anchored at the top of the tree.
    Negated ("!") entries are not supported.
    '''

    ignore_file = Path(".gitignore")
    if not ignore_file.exists():
        return True

    with open(ignore_file, "r") as fil:
        entries = fil.read().split("\n")

    # every path component except the file name itself
    parent_dirs = filename.split("/")[:-1]

    for entry in entries:
        pattern = entry.rstrip()
        if not pattern or pattern.startswith("#"):
            continue # blank line or comment, not a pattern

        if fnmatch.fnmatch(filename, pattern):
            return False

        if not pattern.endswith("/"):
            continue

        # directory entry: ignore every file that lives below it
        dir_pattern = pattern.rstrip("/")
        if "/" in dir_pattern:
            # a leading or inner "/" anchors the entry at the top of the tree
            dir_pattern = dir_pattern.lstrip("/")
            if dir_pattern and fnmatch.fnmatch(filename, dir_pattern + "/*"):
                return False
        elif dir_pattern and any(fnmatch.fnmatch(part, dir_pattern) for part in parent_dirs):
            return False

    return True

def get_code(
        filters: list[ Callable[[str], bool] ] = [filter_default, filter_gitignore, filter_hidden], 
        sizelimit : int | str = "1mb" , 
        total_sizelimit  : int | str = "500mb" , 
        path: str = ".", 
    ) -> dict[str, str]:
    '''Get all code files under a dictionary.
    
    ### Parameters
        - path: path to start traverse.
        - pattern: a regular expression to match files.
        - sizelimit: only save files that are lower than size limit.
    '''
    p = Path(path)
    if isinstance(sizelimit , str):
        sizelimit = convert_size_to_bytes(sizelimit)
    if isinstance(total_sizelimit , str):
        total_sizelimit = convert_size_to_bytes(total_sizelimit)

    saved = {}
    acc_size = 0 # accumulated size
    for file in p.rglob("*"):

        # ensure file
        if not file.is_file():
            continue 

        # get relative path
        rel_path = str( PurePosixPath(file.relative_to(p)) )

        if rel_path == "__init__.py":
            import pdb;pdb.set_trace()

        # apply filters
        flag_skip = False
        for filter in filters: # skip if can not pass all filters
            if not filter(rel_path):
                flag_skip = True
        if flag_skip:
            continue


        # apply filesize limit
        file_size = file.stat().st_size
        if file_size > sizelimit: # skip if too large 
            continue

        acc_size = acc_size + file_size
        if acc_size > total_sizelimit: # break if total too large
            break

        # save file content
        with open(file, "rb") as fil:
            content = fil.read()

        try:
            content = content.decode("utf-8")
        except:
            pass
        
        saved[rel_path] = content

    return saved

def compare_dict(dict_1: dict[str, str] | None, dict_2: dict[str, str] | None):
    '''compare if two str dicts are exactly the same.'''

    if (dict_1 is None) or (dict_2 is None):
        return (dict_1 is None) and (dict_2 is None)
    
    names = list( set(dict_1) | set(dict_2) )
    names.sort()
    hash_1 = "##".join([str( dict_1.get(x) ) for x in names])
    hash_2 = "##".join([str( dict_2.get(x) ) for x in names])
    return hash_1 == hash_2

Functions

def compare_dict(dict_1: dict[str, str] | None, dict_2: dict[str, str] | None)

compare if two str dicts are exactly the same.

Expand source code

def compare_dict(dict_1: dict[str, str] | None, dict_2: dict[str, str] | None):
    '''compare if two str dicts are exactly the same.'''

    if (dict_1 is None) or (dict_2 is None):
        return (dict_1 is None) and (dict_2 is None)
    
    names = list( set(dict_1) | set(dict_2) )
    names.sort()
    hash_1 = "##".join([str( dict_1.get(x) ) for x in names])
    hash_2 = "##".join([str( dict_2.get(x) ) for x in names])
    return hash_1 == hash_2

def filter_default(filename: str)

filter file based on file name.

Expand source code

def filter_default(filename: str):
    '''Tell whether a file name ends with one of the saved extensions.

    Returns True when `filename` ends with ".py", ".sh" or ".txt",
    and False otherwise.
    '''

    suffix_match = re.search(r"((\.py)|(\.sh)|(\.txt))$", filename)
    if suffix_match is None:
        return False
    return True

def filter_gitignore(filename: str)

filter out files that are in .gitignore

Expand source code

def filter_gitignore(filename: str):
    '''Filter out files that are matched by the .gitignore in the working directory.

    ### Parameters
        - filename: "/"-separated relative path of the file to check.

    ### Returns
        False when an entry of ".gitignore" (looked up in the current working
        directory) matches `filename`, True otherwise, including when no
        ".gitignore" exists.

    Entries are matched with `fnmatch`. Blank lines and "#" comment lines are
    skipped and trailing whitespace is dropped. An entry ending with "/" is
    treated as a directory: without any other "/" it matches a directory of
    that name at any depth, otherwise it is anchored at the top of the tree.
    Negated ("!") entries are not supported.
    '''

    ignore_file = Path(".gitignore")
    if not ignore_file.exists():
        return True

    with open(ignore_file, "r") as fil:
        entries = fil.read().split("\n")

    # every path component except the file name itself
    parent_dirs = filename.split("/")[:-1]

    for entry in entries:
        pattern = entry.rstrip()
        if not pattern or pattern.startswith("#"):
            continue # blank line or comment, not a pattern

        if fnmatch.fnmatch(filename, pattern):
            return False

        if not pattern.endswith("/"):
            continue

        # directory entry: ignore every file that lives below it
        dir_pattern = pattern.rstrip("/")
        if "/" in dir_pattern:
            # a leading or inner "/" anchors the entry at the top of the tree
            dir_pattern = dir_pattern.lstrip("/")
            if dir_pattern and fnmatch.fnmatch(filename, dir_pattern + "/*"):
                return False
        elif dir_pattern and any(fnmatch.fnmatch(part, dir_pattern) for part in parent_dirs):
            return False

    return True

def filter_hidden(filename: str)

Skip files whose name, or any directory in their path, starts with ".".

Expand source code

def filter_hidden(filename: str):
    '''Reject paths that contain a dot-prefixed (hidden) component.

    A component counts as hidden when a "." sits at the very start of the
    path or right after a "/" and is followed by any character other than
    a backslash. Returns True when no such component is found.
    '''

    hidden_component = re.search(r"(/|^)\.[^\\]", filename)
    return not hidden_component

def get_code(filters: list[typing.Callable[[str], bool]] = [<function filter_default>, <function filter_gitignore>, <function filter_hidden>], sizelimit: int | str = '1mb', total_sizelimit: int | str = '500mb', path: str = '.') ‑> dict[str, str]

Get all code files under a directory.

Parameters

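- path: path to start traverse.
- filters: functions applied to each file's relative path; a file is saved only if every filter returns True.
- sizelimit: only save files that are not larger than this size limit.
- total_sizelimit: stop collecting once the accumulated size of the saved files would exceed this limit.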

Expand source code

def get_code(
        filters: list[ Callable[[str], bool] ] = [filter_default, filter_gitignore, filter_hidden], 
        sizelimit : int | str = "1mb" , 
        total_sizelimit  : int | str = "500mb" , 
        path: str = ".", 
    ) -> dict[str, str]:
    '''Get all code files under a dictionary.
    
    ### Parameters
        - path: path to start traverse.
        - pattern: a regular expression to match files.
        - sizelimit: only save files that are lower than size limit.
    '''
    p = Path(path)
    if isinstance(sizelimit , str):
        sizelimit = convert_size_to_bytes(sizelimit)
    if isinstance(total_sizelimit , str):
        total_sizelimit = convert_size_to_bytes(total_sizelimit)

    saved = {}
    acc_size = 0 # accumulated size
    for file in p.rglob("*"):

        # ensure file
        if not file.is_file():
            continue 

        # get relative path
        rel_path = str( PurePosixPath(file.relative_to(p)) )

        if rel_path == "__init__.py":
            import pdb;pdb.set_trace()

        # apply filters
        flag_skip = False
        for filter in filters: # skip if can not pass all filters
            if not filter(rel_path):
                flag_skip = True
        if flag_skip:
            continue


        # apply filesize limit
        file_size = file.stat().st_size
        if file_size > sizelimit: # skip if too large 
            continue

        acc_size = acc_size + file_size
        if acc_size > total_sizelimit: # break if total too large
            break

        # save file content
        with open(file, "rb") as fil:
            content = fil.read()

        try:
            content = content.decode("utf-8")
        except:
            pass
        
        saved[rel_path] = content

    return saved