refactor: detect file encoding (#164)

frappe · Sep 15, 2023 · 86d3ba0 · 86d3ba0
1 parent 1a2694a
commit 86d3ba0
Show file tree

Hide file tree

Showing 6 changed files with 170 additions and 25 deletions.
diff --git a/.gitignore b/.gitignore
@@ -6,38 +6,128 @@ locale
 .wnf-lang-status
 *.swp
 *.egg-info
-dist/
-# build/
-.vscode
-.vs
-node_modules
-.kdev4/
-*.kdev4
-*debug.log
-insights/docs/current
-insights/public/frontend
-insights/www/insights.html
-.nyc_output
-coverage/
+
+# Byte-compiled / optimized / DLL files
+__pycache__/
+*.py[cod]
+*$py.class
+
+# C extensions
+*.so
 
 # Distribution / packaging
 .Python
-# build/
+build/
 develop-eggs/
 dist/
 downloads/
 eggs/
 .eggs/
+lib/
 lib64/
 parts/
 sdist/
 var/
 wheels/
+share/python-wheels/
 *.egg-info/
 .installed.cfg
 *.egg
 MANIFEST
 
+# PyInstaller
+#  Usually these files are written by a python script from a template
+#  before PyInstaller builds the exe, so as to inject date/other infos into it.
+*.manifest
+*.spec
+
+# Installer logs
+pip-log.txt
+pip-delete-this-directory.txt
+
+# Unit test / coverage reports
+htmlcov/
+.tox/
+.nox/
+.coverage
+.coverage.*
+.cache
+nosetests.xml
+coverage.xml
+*.cover
+*.py,cover
+.hypothesis/
+.pytest_cache/
+cover/
+
+# Translations
+*.mo
+*.pot
+
+# Django stuff:
+*.log
+local_settings.py
+db.sqlite3
+db.sqlite3-journal
+
+# Flask stuff:
+instance/
+.webassets-cache
+
+# Scrapy stuff:
+.scrapy
+
+# Sphinx documentation
+docs/_build/
+
+# PyBuilder
+.pybuilder/
+target/
+
+# Jupyter Notebook
+.ipynb_checkpoints
+
+# IPython
+profile_default/
+ipython_config.py
+
+# pyenv
+#   For a library or package, you might want to ignore these files since the code is
+#   intended to run in multiple environments; otherwise, check them in:
+.python-version
+
+# pipenv
+#   According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
+#   However, in case of collaboration, if having platform-specific dependencies or dependencies
+#   having no cross-platform support, pipenv may install dependencies that don't work, or not
+#   install all needed dependencies.
+#Pipfile.lock
+
+# poetry
+#   Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
+#   This is especially recommended for binary packages to ensure reproducibility, and is more
+#   commonly ignored for libraries.
+#   https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
+#poetry.lock
+
+# pdm
+#   Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
+#pdm.lock
+#   pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
+#   in version control.
+#   https://pdm.fming.dev/#use-with-ide
+.pdm.toml
+
+# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
+__pypackages__/
+
+# Celery stuff
+celerybeat-schedule
+celerybeat.pid
+
+# SageMath parsed files
+*.sage.py
+
 # Environments
 .env
 .venv
@@ -54,6 +144,43 @@ venv.bak/
 # Rope project settings
 .ropeproject
 
+# mkdocs documentation
+/site
+
+# mypy
+.mypy_cache/
+.dmypy.json
+dmypy.json
+
+# Pyre type checker
+.pyre/
+
+# pytype static type analyzer
+.pytype/
+
+# Cython debug symbols
+cython_debug/
+
+# PyCharm
+#  JetBrains specific template is maintained in a separate JetBrains.gitignore that can
+#  be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
+#  and can be added to the global gitignore or merged into this file.  For a more nuclear
+#  option (not recommended) you can uncomment the following to ignore the entire idea folder.
+.idea/
+
+# build/
+.vscode
+.idea
+.vs
+node_modules
+.kdev4/
+*.kdev4
+*debug.log
+insights/docs/current
+insights/public/frontend
+insights/www/insights.html
+.nyc_output
+coverage/
 
 # Runtime data
 pids

diff --git a/insights/api/__init__.py b/insights/api/__init__.py
@@ -1,7 +1,6 @@
 # Copyright (c) 2022, Frappe Technologies Pvt. Ltd. and contributors
 # For license information, please see license.txt
 
-
 import frappe
 from frappe.integrations.utils import make_post_request
 from frappe.rate_limiter import rate_limit
@@ -19,6 +18,7 @@
     get_permission_filter,
 )
 
+from insights.utils import detect_encoding
 
 @frappe.whitelist()
 @check_role("Insights User")
@@ -321,13 +321,13 @@ def get_columns_from_uploaded_file(filename):
         frappe.throw("Only CSV files are supported")
 
     file_path = file.get_full_path()
-    df = pd.read_csv(file_path)
+    encoding = detect_encoding(file_path)
+    df = pd.read_csv(file_path, encoding=encoding)
     columns = df.columns.tolist()
     columns_with_types = []
     for column in columns:
         column_type = infer_type_from_list(df[column].tolist())
         columns_with_types.append({"label": column, "type": column_type})
-
     return columns_with_types
 
 

diff --git a/insights/insights/doctype/insights_data_source/sources/sqlite.py b/insights/insights/doctype/insights_data_source/sources/sqlite.py
@@ -1,7 +1,6 @@
 # Copyright (c) 2022, Frappe Technologies Pvt. Ltd. and contributors
 # For license information, please see license.txt
 
-
 import frappe
 import pandas as pd
 from sqlalchemy import column as Column
@@ -17,6 +16,8 @@
 from .base_database import BaseDatabase
 from .utils import create_insights_table
 
+from insights.utils import detect_encoding
+
 
 class SQLiteTableFactory:
     def __init__(self, data_source) -> None:
@@ -121,9 +122,12 @@ def table_exists(self, table):
         )
 
     def import_table(self, import_doc: InsightsTableImport):
-        df = pd.read_csv(import_doc._filepath)
+        encoding = detect_encoding(import_doc._filepath)
+        df = pd.read_csv(import_doc._filepath, encoding=encoding)
+
         df.columns = [frappe.scrub(c) for c in df.columns]
         columns_to_import = [c.column for c in import_doc.columns]
+
         df = df[columns_to_import]
         table = import_doc.table_name
         df.to_sql(

diff --git a/insights/insights/doctype/insights_table_import/insights_table_import.py b/insights/insights/doctype/insights_table_import/insights_table_import.py
@@ -9,6 +9,8 @@
 from frappe import task
 from frappe.model.document import Document
 
+from insights.utils import detect_encoding
+
 
 class InsightsTableImport(Document):
     def __init__(self, *args, **kwargs):
@@ -42,12 +44,13 @@ def before_save(self):
             self.set_columns_and_no_of_rows()
 
     def set_columns_and_no_of_rows(self):
-        column_names = []
-        with open(self._filepath, "r") as f:
+        encoding = detect_encoding(self._filepath)
+        with open(self._filepath, "r", encoding=encoding, errors="replace") as f:
             # read the header row for the column names, then count the remaining data rows
-            reader = csv.reader(f)
-            column_names = next(reader)
-            no_of_rows = sum(1 for _ in reader)
+            csv_reader = csv.DictReader(f)
+            column_names = csv_reader.fieldnames
+            rows = list(csv_reader)
+            no_of_rows = len(rows)
 
         self.db_set("rows", no_of_rows)
         for column in column_names:
@@ -77,6 +80,7 @@ def start_import(name, filepath=None):
         table_import = frappe.get_doc("Insights Table Import", name)
         table_import._filepath = filepath or table_import._filepath
         table_import.db_set("status", "Started")
+
         try:
             table_import._data_source.db.import_table(table_import)
             table_import.db_set("status", "Success")

diff --git a/insights/utils.py b/insights/utils.py
@@ -1,7 +1,9 @@
 # Copyright (c) 2022, Frappe Technologies Pvt. Ltd. and contributors
 # For license information, please see license.txt
 
+import pathlib
 from typing import List, Union
+import chardet
 
 import frappe
 
@@ -74,3 +76,10 @@ class InsightsSettings:
     @classmethod
     def get(cls, key):
         return frappe.db.get_single_value("Insights Settings", key)
+
+
+def detect_encoding(file_path: str) -> str:
+    """Return the chardet-detected encoding of the file, falling back to "utf-8"."""
+    with open(file_path, "rb") as file:
+        result = chardet.detect(file.read())
+    return result["encoding"] or "utf-8"  # chardet yields None for empty/undecidable data
diff --git a/requirements.txt b/requirements.txt
@@ -1,4 +1,5 @@
 # frappe -- https://github.com/frappe/frappe is installed via 'bench init'
 pandas==1.5.1
 SQLAlchemy==1.4.43
-python-telegram-bot==12.8
+python-telegram-bot==12.8
+chardet==4.0.0