diff --git a/README.md b/README.md index 6244340..789c0b4 100644 --- a/README.md +++ b/README.md @@ -36,10 +36,12 @@ By using this template, your data science project is auto-generated as follows: ``` . |-- notebooks # A directory to place all notebooks files. -| `-- *.ipynb +| |-- *.ipynb +| `-- my_nb_path.py # Imported by *.ipynb to treat src/ as PYTHONPATH |-- setup.py # To pip install your Python module (if module name specified to cookiecutter) |-- src | |-- my_custom_module # Your custom module +| |-- my_nb_color.py # Imported by *.ipynb to colorize their outputs | `-- source_dir # You can further create this subdir for SageMaker entrypoint scripts |-- tests # Unit tests diff --git a/cookiecutter.json b/cookiecutter.json index 2014bb2..893409e 100644 --- a/cookiecutter.json +++ b/cookiecutter.json @@ -8,9 +8,5 @@ "MIT License", "Apache-2.0 License" ], - "python_interpreter": [ - "python3", - "python" - ], "package_name": "" } diff --git a/hooks/post_gen_project.py b/hooks/post_gen_project.py index 4916e98..38c047e 100644 --- a/hooks/post_gen_project.py +++ b/hooks/post_gen_project.py @@ -50,7 +50,7 @@ def rm(s: Path) -> None: message.append("# - review LICENSE") if package_name != "": message += [ - "# - review and update setup.py, then remove the exception at the end.", + "# - review and update setup.py, then remove the exception at the start.", "# - consider to adopt versioneer to version your package.", ] message += [ diff --git a/{{cookiecutter.repo_name}}/README.md b/{{cookiecutter.repo_name}}/README.md index 92708f6..209d8f9 100644 --- a/{{cookiecutter.repo_name}}/README.md +++ b/{{cookiecutter.repo_name}}/README.md @@ -8,14 +8,16 @@ {{cookiecutter.repo_name}} |-- bin # CLI scripts |-- notebooks -| `-- *.ipynb # Jupyter notebooks +| |-- *.ipynb # Jupyter notebooks +| `-- my_nb_path.py # Imported by *.ipynb to treat src/ as PYTHONPATH {% if cookiecutter.package_name != "" -%} |-- setup.py # To install {{cookiecutter.repo_name}} as a Python module {% 
endif -%} |-- src # Python modules developed in this project {% if cookiecutter.package_name != "" -%} -| `-- {{cookiecutter.repo_name}} +| |-- {{cookiecutter.repo_name}} {% endif -%} +| `-- my_nb_color.py # Imported by *.ipynb to colorize their outputs `-- tests # Unit tests # Miscellaneous files diff --git a/{{cookiecutter.repo_name}}/ipython_config.py b/{{cookiecutter.repo_name}}/ipython_config.py deleted file mode 100644 index d97c322..0000000 --- a/{{cookiecutter.repo_name}}/ipython_config.py +++ /dev/null @@ -1,38 +0,0 @@ -import os -import subprocess -from pathlib import Path -from typing import Union - -#################################################################################################### -# Additional PYTHONPATH to allow notebooks to import custom modules at a few pre-defined places. - - -def sys_path_append(o: Union[str, os.PathLike]) -> str: - posix_path: str = o.as_posix() if isinstance(o, Path) else Path(o).as_posix() - return 'sys.path.insert(0, "{}")'.format(posix_path) - - -_pythonpath = [ - "import sys, os", - sys_path_append(os.getcwd()), -] - -# Add GIT_ROOT/ and a few other subdirs -try: - _p = subprocess.run( - ["git", "rev-parse", "--show-toplevel"], stdout=subprocess.PIPE, stderr=subprocess.STDOUT - ) - - if _p.returncode == 0: - _git_root: str = _p.stdout[:-1].decode("utf-8") # Remove trailing '\n' - _git_root_p: Path = Path(_git_root) - _pythonpath += [ - sys_path_append(_git_root_p), # GIT_ROOT - sys_path_append(_git_root_p / "src"), # GIT_ROOT/src - sys_path_append(_git_root_p / "notebooks"), # GIT_ROOT/notebooks - ] -except: # noqa: E722 - pass - -c.InteractiveShellApp.exec_lines = _pythonpath # type: ignore # noqa: F821 -#################################################################################################### diff --git a/{{cookiecutter.repo_name}}/notebooks/my_nb_path.py b/{{cookiecutter.repo_name}}/notebooks/my_nb_path.py new file mode 100644 index 0000000..0571245 --- /dev/null +++ 
b/{{cookiecutter.repo_name}}/notebooks/my_nb_path.py @@ -0,0 +1,69 @@ +"""Allow notebooks to import custom modules at a few pre-defined places within this project's +git repository. + +When imported, adds ``GITROOT``, ``GITROOT/src``, and ``GITROOT/notebooks`` to `sys.path`. + +Place this file in the same directory as your ``.ipynb`` files. If ``.ipynb`` files are organized +into subfolders, please ensure this file is present in each subfolder. Example: + +.. code-block:: bash + + GITROOT + |-- .git # Signify this is a git repository + |-- notebooks # Parent folder of Jupyter notebooks + | |-- folder-a + | | |-- my_nb_path.py # Importable by nb-abc.ipynb and nb-xyz.ipynb + | | |-- nb-abc.ipynb + | | `-- nb-xyz.ipynb + | |-- my_nb_path.py # Importable by nb-01.ipynb and nb-02.ipynb + | |-- nb-01.ipynb + | `-- nb-02.ipynb + `-- src + `-- my_custom_module + |-- __init__.py + `-- ... + +Usage by ``.ipynb``: + + >>> # Allow this notebook to import from GITROOT, GITROOT/src, and GITROOT/notebooks. + >>> # This module must be imported before importing any other custom modules under GITROOT. + >>> # The isort directive prevents the statement from being moved around when isort is used. + >>> import my_nb_path # isort: skip + >>> + >>> # Test-drive importing a custom module under GITROOT/src. + >>> import my_custom_module + +Background: we used to rely on ``ipython_config.py`` in the current working directory. However, +IPython 8.0.1+, 7.31.1+ and 5.11+ disable this behavior for security reasons, as described +[here](https://ipython.readthedocs.io/en/stable/whatsnew/version8.html#ipython-8-0-1-cve-2022-21699). + +So now, each ``.ipynb`` must explicitly modify its own `sys.path`, which is what this module offers +as a convenience. 
+""" +import os +import subprocess +import sys +from pathlib import Path +from typing import Union + +def sys_path_append(o: Union[str, os.PathLike]) -> None: + posix_path: str = o.as_posix() if isinstance(o, Path) else Path(o).as_posix() + if posix_path not in sys.path: + sys.path.insert(0, posix_path) + +# Add GIT_ROOT/ and a few other subdirs +_p = subprocess.run( + ["git", "rev-parse", "--show-toplevel"], stdout=subprocess.PIPE, stderr=subprocess.STDOUT +) + +if _p.returncode == 0: + _git_root: str = _p.stdout[:-1].decode("utf-8") # Remove trailing '\n' + _git_root_p = Path(_git_root) + + my_sys_paths = [ + _git_root_p, + _git_root_p / "src", + _git_root_p / "notebooks", + ] + for sp in my_sys_paths: + sys_path_append(sp) diff --git a/{{cookiecutter.repo_name}}/notebooks/skeleton.ipynb b/{{cookiecutter.repo_name}}/notebooks/skeleton.ipynb index 11d7c3c..4ec4031 100644 --- a/{{cookiecutter.repo_name}}/notebooks/skeleton.ipynb +++ b/{{cookiecutter.repo_name}}/notebooks/skeleton.ipynb @@ -13,8 +13,9 @@ "- Best viewed using Jupyter Lab.\n", "- The title is a styled sentence rather than `h1`, to prevent it being showed and numbered in TOC.\n", "\n", - "
NOTE: this skeleton notebook is primarily for reading. To run it\n", - "completely, you need to install additional dependencies imported in the cell below.

" + "**NOTE:** this skeleton notebook is meant for reading. To run it,\n", + "please install additional dependencies imported in the second next cell which starts with line\n", + "`# Dependencies required`." ] }, { @@ -28,12 +29,24 @@ "%load_ext autoreload\n", "%autoreload 2\n", "\n", - "# Follow isort>=5 style: 'import ...' statements before 'from ... import ...'.\n", + "# Make sure my_nb_path is imported first (and when isort is used, it needs to be told).\n", + "import my_nb_path # isort: skip\n", + "from my_nb_color import print, rprint" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Dependencies required\n", "import ndpretty\n", "import numpy as np\n", "import pandas as pd\n", "import sagemaker as sm\n", "from IPython.display import Markdown\n", + "from loguru import logger\n", "from smallmatter.ds import mask_df # See: https://github.com/aws-samples/smallmatter-package/\n", "\n", "# A few standard SageMaker's stanzas. 
Use type annotation to be verbose.\n", @@ -145,6 +158,67 @@ "# Improved output" ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Colored outputs" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Colored: \u001b[1m{\u001b[0m\u001b[32m'AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA'\u001b[0m, \u001b[32m'BBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBB'\u001b[0m\u001b[1m}\u001b[0m\n", + "Colored and wrapped:\n", + "\u001b[1m{\u001b[0m\n", + " \u001b[32m'AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA\u001b[0m\n", + "\u001b[32mAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA\u001b[0m\n", + "\u001b[32mAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA'\u001b[0m,\n", + " \u001b[32m'BBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBB\u001b[0m\n", + "\u001b[32mBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBB\u001b[0m\n", + "\u001b[32mBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBB'\u001b[0m\n", + "\u001b[1m}\u001b[0m\n", + "\n", + "\u001b[1m{\u001b[0m\n", + " \u001b[32m'AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA'\u001b[0m,\n", + " \u001b[32m'BBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBB'\u001b[0m\n", + 
"\u001b[1m}\u001b[0m\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\u001b[32m2022-01-22 17:23:03.529\u001b[0m | \u001b[34m\u001b[1mDEBUG \u001b[0m | \u001b[36m__main__\u001b[0m:\u001b[36m\u001b[0m:\u001b[36m7\u001b[0m - \u001b[34m\u001b[1mHello World!\u001b[0m\n", + "\u001b[32m2022-01-22 17:23:03.530\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36m__main__\u001b[0m:\u001b[36m\u001b[0m:\u001b[36m7\u001b[0m - \u001b[1mHello World!\u001b[0m\n", + "\u001b[32m2022-01-22 17:23:03.531\u001b[0m | \u001b[32m\u001b[1mSUCCESS \u001b[0m | \u001b[36m__main__\u001b[0m:\u001b[36m\u001b[0m:\u001b[36m7\u001b[0m - \u001b[32m\u001b[1mHello World!\u001b[0m\n", + "\u001b[32m2022-01-22 17:23:03.532\u001b[0m | \u001b[31m\u001b[1mERROR \u001b[0m | \u001b[36m__main__\u001b[0m:\u001b[36m\u001b[0m:\u001b[36m7\u001b[0m - \u001b[31m\u001b[1mHello World!\u001b[0m\n" + ] + } + ], + "source": [ + "d = {\"A\" * 200, \"B\" * 200}\n", + "print(\"Colored:\", d)\n", + "rprint(\"Colored and wrapped:\", d)\n", + "display(d)\n", + "\n", + "for f in (logger.debug, logger.info, logger.success, logger.error):\n", + " f(\"Hello World!\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Dataframes" + ] + }, { "cell_type": "code", "execution_count": null, @@ -153,8 +227,8 @@ { "data": { "text/markdown": [ - "## Plain dataframe\n", - "**NOTE:** this also appears in TOC as \"*2.1. Plain dataframe*\"" + "### Plain dataframe\n", + "**NOTE:** this also appears in TOC as \"*2.2.1. Plain dataframe*\"" ], "text/plain": [ "" @@ -221,7 +295,7 @@ { "data": { "text/markdown": [ - "## Masked dataframe\n", + "### Masked dataframe\n", "Sometime, we would like to version the output of this cell into the git repo, to help readers to\n", "quickly see the shape of a dataframe.\n", "\n", @@ -310,17 +384,19 @@ ")\n", "df_b = pd.DataFrame(\n", " {\n", - " \"userid\": [1000, 2000, 3000],\n", + " \"userid\": [1000, 2000, 3000], # Illustration only. 
Usually read from somewhere.\n", " \"pca_a\": [0.1, 0.2, 0.3],\n", " \"pca_b\": [-0.3, 0.01, 0.7],\n", " }\n", ")\n", "\n", "display(\n", - " Markdown('## Plain dataframe\\n**NOTE:** this also appears in TOC as \"*2.1. Plain dataframe*\"'),\n", + " Markdown(\n", + " '### Plain dataframe\\n**NOTE:** this also appears in TOC as \"*2.2.1. Plain dataframe*\"'\n", + " ),\n", " df_a,\n", " Markdown(\n", - " \"\"\"## Masked dataframe\n", + " \"\"\"### Masked dataframe\n", "Sometime, we would like to version the output of this cell into the git repo, to help readers to\n", "quickly see the shape of a dataframe.\n", "\n", @@ -400,9 +476,9 @@ ], "metadata": { "kernelspec": { - "display_name": "Environment (virtualenv_p39x)", + "display_name": "Environment (virtualenv_ds-p310)", "language": "python", - "name": "virtualenv_p39x" + "name": "virtualenv_ds-p310" }, "language_info": { "codemirror_mode": { @@ -414,7 +490,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.9.2" + "version": "3.10.2" }, "toc-autonumbering": true, "toc-showcode": false, diff --git a/{{cookiecutter.repo_name}}/setup.py b/{{cookiecutter.repo_name}}/setup.py index 85a7ad6..46f5f88 100644 --- a/{{cookiecutter.repo_name}}/setup.py +++ b/{{cookiecutter.repo_name}}/setup.py @@ -1,3 +1,8 @@ +raise ValueError( + "Baseline setup.py from cookiecutter aws-samples/python-data-science-template. " + "Please review and modify accordingly, then remove this exception" +) + import os from typing import List @@ -55,8 +60,3 @@ def read(fname) -> str: python_requires=">=3.6.0", install_requires=required_packages, ) - -raise ValueError( - "Baseline setup.py from cookiecutter verdimrc/py-ds-template. 
" - "Please review and modify accordingly, then remove this exception" -) diff --git a/{{cookiecutter.repo_name}}/src/my_nb_color.py b/{{cookiecutter.repo_name}}/src/my_nb_color.py new file mode 100644 index 0000000..f13a467 --- /dev/null +++ b/{{cookiecutter.repo_name}}/src/my_nb_color.py @@ -0,0 +1,44 @@ +"""Convenience module to setup color prints and logs in a Jupyter notebook. + +Dependencies: `loguru`, `rich`. + +Basic usage by an ``.ipynb``: + + >>> # Colorize notebook outputs + >>> from my_nb_color import print, rprint, oprint + >>> + >>> # Test-drive different behavior of print functionalities + >>> d = {"A" * 200, "B" * 200} + >>> print("Colored:", d) + >>> rprint("Colored and wrapped:", d) + >>> oprint("Plain (i.e., Python's original):", d) + >>> display(d) + >>> + >>> # Test-drive loguru + >>> from loguru import logger + >>> for f in (logger.debug, logger.info, logger.success, logger.error): + >>> f("Hello World!") +""" +import sys + + +# Try to setup rich. +try: + import rich +except ModuleNotFoundError: + print = rprint = oprint = print +else: + oprint = print # In-case plain old behavior is needed + rich.reconfigure(force_terminal=True, force_jupyter=False) + rich.pretty.install() + print = rich.get_console().out + rprint = rich.get_console().print + + +# Try to setup loguru. +try: + from loguru import logger +except ModuleNotFoundError: + pass +else: + logger.configure(handlers=[dict(sink=sys.stderr, colorize=True)])