diff --git a/.gitignore b/.gitignore
index 9c16841..f182639 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,9 +1,123 @@
-build/
-.vscode/
+# IDE
+.idea
+
+# Byte-compiled / optimized / DLL files
+__pycache__/
+*.py[cod]
+*$py.class
+
+# C extensions
 *.so
-.python-version
-__pycache__
-*.egg-info
+
+# Distribution / packaging
+.Python
+build/
+develop-eggs/
+dist/
+downloads/
+eggs/
+.eggs/
+lib/
+lib64/
+parts/
+sdist/
+var/
+wheels/
+*.egg-info/
+.installed.cfg
 *.egg
-.gdb_history
-.pytest_cache
\ No newline at end of file
+MANIFEST
+*.whl
+
+# PyInstaller
+#  Usually these files are written by a python script from a template
+#  before PyInstaller builds the exe, so as to inject date/other infos into it.
+*.manifest
+*.spec
+
+# Installer logs
+pip-log.txt
+pip-delete-this-directory.txt
+
+# Unit test / coverage reports
+htmlcov/
+.tox/
+.coverage
+.coverage.*
+.cache
+nosetests.xml
+coverage.xml
+*.cover
+.hypothesis/
+.pytest_cache/
+
+# Translations
+*.mo
+*.pot
+
+# Django stuff:
+*.log
+local_settings.py
+db.sqlite3
+
+# Flask stuff:
+instance/
+.webassets-cache
+
+# Scrapy stuff:
+.scrapy
+
+# Sphinx documentation
+docs/_build/
+
+# PyBuilder
+target/
+
+# Jupyter Notebook
+.ipynb_checkpoints
+
+# pyenv
+.python-version
+
+# celery beat schedule file
+celerybeat-schedule
+
+# SageMath parsed files
+*.sage.py
+
+# Environments
+.env
+.venv
+env/
+venv/
+ENV/
+env.bak/
+venv.bak/
+
+# Spyder project settings
+.spyderproject
+.spyproject
+
+# Rope project settings
+.ropeproject
+
+# mkdocs documentation
+/site
+
+# mypy
+.mypy_cache/
+
+.DS_Store
+
+# data directory
+_download
+
+# CTags & CScope
+tags
+cscope.*
+
+# Vim
+*.swp
+*.swo
+*.un~
+*~
diff --git a/CMakeLists.txt b/CMakeLists.txt
index eac303e..18ad66f 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -4,7 +4,8 @@ project(tfdlpack C CXX)
 
 add_definitions(-std=c++11 -fPIC)
 include(cmake/util/FindCUDA.cmake)	
-option(NO_CUDA "Only build TFDLPACK with cpu" OFF)
+
+option(USE_CUDA "Build TF-DLPACK with CUDA support" ON)
 
 if(NOT PYTHON_EXECUTABLE)
   execute_process(
@@ -35,27 +36,27 @@ set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -O2 ${TF_CFLAGS}")
 set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} -O2 -D_GLIBCXX_USE_CXX11_ABI=0 ${TF_CFLAGS}")
 set(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} -O0 -g3 -ggdb -D_GLIBCXX_USE_CXX11_ABI=0 ${TF_CFLAGS}")
 
-if (NOT NO_CUDA)
+if (USE_CUDA)
+  message(STATUS "Build with CUDA.")
   add_definitions(-DTFDLPACK_USE_CUDA)
   find_cuda(ON REQUIRED)
   if(NOT CUDA_FOUND)	
     message(FATAL_ERROR "Cannot find CUDA.")	
   endif()
+else()
+  message(STATUS "Build without CUDA.")
 endif()
 
 include_directories(third_party/dlpack/include)
 
 file(GLOB SRC
-  src/dlpack_op.cc
-  src/to_dlpack_kernel.cc
-  src/get_device_and_dtype_kernel.cc
-  src/from_dlpack_kernel.cc
+  src/*.cc
 )
 
-if (NO_CUDA)
-  add_library(tfdlpack SHARED ${SRC})
-else()
+if (USE_CUDA)
   cuda_add_library(tfdlpack SHARED ${SRC})
+else()
+  add_library(tfdlpack SHARED ${SRC})
 endif()
 
 target_link_libraries(tfdlpack ${TF_LFLAGS})
diff --git a/Jenkinsfile b/Jenkinsfile
index bfee5f7..48275be 100644
--- a/Jenkinsfile
+++ b/Jenkinsfile
@@ -11,7 +11,7 @@ pipeline {
     stage("Lint Check") {
       agent { 
         docker { 
-          image "dgllib/tfdlpack-test" 
+          image "dgllib/tfdlpack-ci-gpu" 
           args "--runtime nvidia"
         } 
       }
@@ -30,13 +30,13 @@ pipeline {
     stage("Build and Test") {
       agent { 
         docker { 
-          image "dgllib/tfdlpack-test" 
+          image "dgllib/tfdlpack-ci-gpu" 
           args "--runtime nvidia"
         } 
       }
       steps {
         init_git()
-        sh "python -m pip install ."
+        sh "bash tests/scripts/task_build.sh"
         sh "python -m pytest tests"
       }
       post {
@@ -45,6 +45,5 @@ pipeline {
         }
       }
     }
-
   }
-}
\ No newline at end of file
+}
diff --git a/README.md b/README.md
index 15a2ecb..56480e3 100644
--- a/README.md
+++ b/README.md
@@ -3,51 +3,49 @@
 
 Notes: Currently only tested under tensorflow 2.0's eager mode. Implementation details could be found [here](https://github.com/VoVAllen/tf-dlpack/issues/3).
 
-
 ## Install
-Set allow growth, otherwise tf would take over whole gpu
-```bash
-export TF_FORCE_GPU_ALLOW_GROWTH=true
-```
 
-### Pip install
+Pip install
 ```bash
-pip install git+https://github.com/VoVAllen/tf-dlpack.git
+pip install tfdlpack  # no cuda
+# pip install tfdlpack-gpu  # with cuda support
 ```
 
-### Local install
+## Usage
+Enable memory growth; otherwise TensorFlow will take over the whole GPU:
 ```bash
-python setup.py install
-# or
-pip install .
+export TF_FORCE_GPU_ALLOW_GROWTH=true
 ```
 
-## Usage
+Use `tfdlpack`
+
 ```python
 import tfdlpack
 dl_capsule = tfdlpack.to_dlpack(tf_tensor)    # Convert tf tensor to dlpack capsule
 tf_tensor = tfdlpack.from_dlpack(dl_capsule)  # Convert dlpack capsule to tf tensor
 ```
 
+## Build and develop locally
 
-## Build Manually
-
-Build
+Build plugin library
 ```
 mkdir build
 cd build
-cmake ..
+cmake ..  # To build without CUDA, add -DUSE_CUDA=OFF
 make -j4
 ```
 
-so file path is now fixed in `python/tfdlpack/__init__.py`
-Need to change manually
+Export the library path:
+```bash
+export TFDLPACK_LIBRARY_PATH=/path/to/tf-dlpack/repo/build
+```
 
-And export the python path to `import tfdlpack`
+Export python path to `import tfdlpack`
 ```bash
-export PYTHONPATH=/home/ubuntu/dev/tfdlpack/python/:${PYTHONPATH}
+export PYTHONPATH=/path/to/tf-dlpack/repo/python/:${PYTHONPATH}
 ```
 
+
 ## License
 
 [Apache License 2.0](LICENSE)
diff --git a/docker/Dockerfile.cpu b/docker/Dockerfile.cpu
new file mode 100644
index 0000000..b2dc99e
--- /dev/null
+++ b/docker/Dockerfile.cpu
@@ -0,0 +1,13 @@
+# CPU-only CI image for tf-dlpack.
+FROM tensorflow/tensorflow:2.0.0-py3
+
+COPY install/ubuntu_install_core.sh /install/ubuntu_install_core.sh
+RUN bash /install/ubuntu_install_core.sh
+
+RUN pip3 install pytest cpplint pylint
+# Install the CPU build of PyTorch rather than the CUDA 10.0 (cu100) wheel;
+# this mirrors the `cpuonly` selection made by ubuntu_install_conda.sh for "cpu".
+RUN pip3 install torch==1.3.1+cpu -f https://download.pytorch.org/whl/torch_stable.html
+
+COPY install/ubuntu_install_conda.sh /install/ubuntu_install_conda.sh
+RUN bash /install/ubuntu_install_conda.sh cpu
diff --git a/docker/Dockerfile.ci b/docker/Dockerfile.gpu
similarity index 62%
rename from docker/Dockerfile.ci
rename to docker/Dockerfile.gpu
index 8586e85..8f9f531 100644
--- a/docker/Dockerfile.ci
+++ b/docker/Dockerfile.gpu
@@ -1,8 +1,10 @@
-
 FROM tensorflow/tensorflow:2.0.0-gpu-py3
 
 COPY install/ubuntu_install_core.sh /install/ubuntu_install_core.sh
 RUN bash /install/ubuntu_install_core.sh
 
 RUN pip3 install pytest gpustat cpplint pylint
-RUN pip3 install https://download.pytorch.org/whl/cu100/torch-1.3.1%2Bcu100-cp36-cp36m-linux_x86_64.whl
\ No newline at end of file
+RUN pip3 install https://download.pytorch.org/whl/cu100/torch-1.3.1%2Bcu100-cp36-cp36m-linux_x86_64.whl
+
+COPY install/ubuntu_install_conda.sh /install/ubuntu_install_conda.sh
+RUN bash /install/ubuntu_install_conda.sh gpu
diff --git a/docker/build_docker.sh b/docker/build_docker.sh
index 64b2f2d..e1352e5 100644
--- a/docker/build_docker.sh
+++ b/docker/build_docker.sh
@@ -1,3 +1,4 @@
 #/usr/bin/sh
 # From current directory
-docker build -t dgllib/tfdlpack-test -f Dockerfile.ci .
\ No newline at end of file
+docker build -t dgllib/tfdlpack-ci-cpu -f Dockerfile.cpu .
+docker build -t dgllib/tfdlpack-ci-gpu -f Dockerfile.gpu .
diff --git a/docker/install/ubuntu_install_conda.sh b/docker/install/ubuntu_install_conda.sh
new file mode 100644
index 0000000..616358f
--- /dev/null
+++ b/docker/install/ubuntu_install_conda.sh
@@ -0,0 +1,36 @@
+#!/bin/bash
+# Install Miniconda and create one conda environment per listed Python
+# version, each with TensorFlow 2.0, pytest and PyTorch.
+#
+# Usage: ubuntu_install_conda.sh <cpu|gpu>   (anything other than "cpu" selects the GPU packages)
+
+# Abort on the first failing command so that a broken download or install
+# fails the image build instead of leaving a half-provisioned image.
+set -e
+
+DEV=$1
+
+# Quoted so that a missing/empty argument does not break the test expression.
+if [ "$DEV" = "cpu" ]; then
+  TF="tensorflow"
+  TH="pytorch cpuonly"
+else
+  TF="tensorflow-gpu"
+  TH="pytorch"
+fi
+
+wget -O /tmp/install.sh https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh
+sh /tmp/install.sh -b
+
+CONDA_PREFIX=$HOME/miniconda3/bin
+export PATH=$CONDA_PREFIX:$PATH
+for PY_VER in 3.6.4 3.7.0; do
+  echo "Create conda env for python $PY_VER"
+  conda create -n $PY_VER -y python=$PY_VER
+  source activate $PY_VER
+  conda install -y $TF==2.0 pytest
+  # $TH stays unquoted on purpose: "pytorch cpuonly" must split into two package names.
+  echo conda install -y $TH -c pytorch
+  conda install -y $TH -c pytorch
+  source deactivate
+done
diff --git a/python/setup.py b/python/setup.py
new file mode 100644
index 0000000..4edaa90
--- /dev/null
+++ b/python/setup.py
@@ -0,0 +1,154 @@
+"""Packaging script for tfdlpack.
+
+The native library is not compiled by ``setup()``: it is located through
+``tfdlpack/libinfo.py`` and shipped inside the package. The environment
+variable ``TFDLPACK_PACKAGE_SUFFIX`` (e.g. ``-gpu``) is appended to both the
+distribution name and the TensorFlow requirement.
+"""
+import os
+import re
+import sys
+import shutil
+import platform
+import subprocess
+
+from setuptools import find_packages
+from setuptools import setup, Extension
+from setuptools.dist import Distribution
+from setuptools.command.build_ext import build_ext
+from distutils.version import LooseVersion
+
+class BinaryDistribution(Distribution):
+    """Distribution that always reports having extension modules."""
+    def has_ext_modules(self):
+        return True
+
+CURRENT_DIR = os.path.dirname(__file__)
+
+# Read once with an explicit '' default: formatting os.getenv()'s None into the
+# requirement string would yield the bogus requirement 'tensorflowNone>=2.0.0'.
+PACKAGE_SUFFIX = os.getenv('TFDLPACK_PACKAGE_SUFFIX', '')
+
+def get_lib_path():
+    """Return ``(libs, version)``: a one-element list holding the first library
+    path reported by ``libinfo.find_lib_path()`` and ``libinfo.__version__``."""
+    # libinfo.py is executed instead of imported: importing it would invoke
+    # the package's __init__.py, which introduces dependencies.
+    libinfo_py = os.path.join(CURRENT_DIR, 'tfdlpack', 'libinfo.py')
+    libinfo = {'__file__': libinfo_py}
+    with open(libinfo_py, "rb") as libinfo_file:
+        libinfo_source = libinfo_file.read()
+    exec(compile(libinfo_source, libinfo_py, 'exec'), libinfo, libinfo)
+    version = libinfo['__version__']
+
+    lib_path = libinfo['find_lib_path']()
+    libs = [lib_path[0]]
+
+    return libs, version
+
+LIBS, VERSION = get_lib_path()
+
+class CMakeExtension(Extension):
+    """Source-less extension that only records the CMake source directory.
+
+    NOTE(review): not referenced by the ``setup()`` call in this file.
+    """
+    def __init__(self, name, sourcedir=''):
+        Extension.__init__(self, name, sources=[])
+        self.sourcedir = os.path.abspath(sourcedir)
+
+
+class CMakeBuild(build_ext):
+    """``build_ext`` command that configures and builds extensions with CMake.
+
+    NOTE(review): not registered through ``cmdclass`` in this file's ``setup()``.
+    """
+    def run(self):
+        """Check that CMake can be launched, then build every extension."""
+        try:
+            # Only the ability to launch cmake matters; its output is unused.
+            subprocess.check_output(['cmake', '--version'])
+        except OSError:
+            raise RuntimeError("CMake must be installed to build the following extensions: " +
+                               ", ".join(e.name for e in self.extensions))
+
+        if platform.system() == "Windows":
+            raise RuntimeError("Windows not currently supported")
+
+        for ext in self.extensions:
+            self.build_extension(ext)
+
+    def build_extension(self, ext):
+        """Run ``cmake`` and ``cmake --build`` for ``ext`` inside ``self.build_temp``."""
+        extdir = os.path.abspath(os.path.dirname(
+            self.get_ext_fullpath(ext.name)))
+        extdir = os.path.join(extdir, "tfdlpack", "build") # Not sure whether this is fine
+
+        cmake_args = ['-DCMAKE_LIBRARY_OUTPUT_DIRECTORY=' + extdir,
+                      '-DPYTHON_EXECUTABLE=' + sys.executable]
+
+        cfg = 'Debug' if self.debug else 'Release'
+        build_args = ['--config', cfg]
+        env = os.environ.copy()
+        if not os.path.exists(self.build_temp):
+            os.makedirs(self.build_temp)
+        subprocess.check_call(['cmake', ext.sourcedir] +
+                              cmake_args, cwd=self.build_temp, env=env)
+        subprocess.check_call(['cmake', '--build', '.'] +
+                              build_args, cwd=self.build_temp)
+
+include_libs = False
+wheel_include_libs = False
+if "bdist_wheel" in sys.argv or os.getenv('CONDA_BUILD'):
+    wheel_include_libs = True
+else:
+    include_libs = True
+
+setup_kwargs = {}
+
+# For bdist_wheel (or when CONDA_BUILD is set): stage the library inside the
+# package directory and list it in a temporary MANIFEST.in.
+if wheel_include_libs:
+    with open("MANIFEST.in", "w") as fo:
+        for path in LIBS:
+            shutil.copy(path, os.path.join(CURRENT_DIR, 'tfdlpack'))
+            _, libname = os.path.split(path)
+            fo.write("include tfdlpack/%s\n" % libname)
+    setup_kwargs = {
+        "include_package_data": True
+    }
+
+# For source tree setup: ship the library as a data file.
+if include_libs:
+    rpath = [os.path.relpath(path, CURRENT_DIR) for path in LIBS]
+    setup_kwargs = {
+        "include_package_data": True,
+        "data_files": [('tfdlpack', rpath)]
+    }
+
+try:
+    setup(
+        name='tfdlpack' + PACKAGE_SUFFIX,
+        version=VERSION,
+        author='Jinjing Zhou',
+        author_email='allen.zhou@nyu.edu',
+        description='Tensorflow plugin for DLPack',
+        packages=find_packages(),
+        install_requires=['tensorflow%s>=2.0.0' % PACKAGE_SUFFIX],
+        long_description="""
+The package adds interoperability of DLPack to Tensorflow. It contains straightforward
+and easy-to-use APIs to convert Tensorflow tensors from/to DLPack format.
+    """,
+        distclass=BinaryDistribution,
+        zip_safe=False,
+        license='APACHE',
+        **setup_kwargs
+    )
+finally:
+    if wheel_include_libs:
+        # Wheel cleanup, done even when setup() fails so that no stale
+        # MANIFEST.in or copied library is left behind in the source tree.
+        os.remove("MANIFEST.in")
+        for path in LIBS:
+            _, libname = os.path.split(path)
+            os.remove(os.path.join(CURRENT_DIR, 'tfdlpack', libname))
diff --git a/python/tfdlpack/core.py b/python/tfdlpack/core.py
index 00591b9..5e3d986 100644
--- a/python/tfdlpack/core.py
+++ b/python/tfdlpack/core.py
@@ -10,7 +10,6 @@
 # version number
 __version__ = libinfo.__version__
 
-# print(libinfo.find_lib_path()[0])
 dlpack_ops = load_library.load_op_library(libinfo.find_lib_path()[0])
 _to_dlpack_address = dlpack_ops.to_dlpack
 _from_dlpack = dlpack_ops.from_dlpack
diff --git a/python/tfdlpack/libinfo.py b/python/tfdlpack/libinfo.py
index 7bc0f0b..2d49f7c 100644
--- a/python/tfdlpack/libinfo.py
+++ b/python/tfdlpack/libinfo.py
@@ -19,19 +19,17 @@ def find_lib_path(name=None, search_path=None, optional=False):
     """
     # See https://github.com/dmlc/tvm/issues/281 for some background.
 
-    # NB: This will either be the source directory (if DGL is run
-    # inplace) or the install directory (if DGL is installed).
-    # An installed DGL's curr_path will look something like:
-    #   $PREFIX/lib/python3.6/site-packages/tfdlpack/_ffi
+    # NB: This will either be the source directory (if tfdlpack is run
+    # inplace) or the install directory (if tfdlpack is installed).
+    # An installed tfdlpack's curr_path will look something like:
+    #   $PREFIX/lib/python3.6/site-packages/tfdlpack
     source_dir = os.path.dirname(os.path.abspath(os.path.expanduser(__file__)))
     install_lib_dir = os.path.join(source_dir, "build")
-    # source_dir = os.path.join(ffi_dir, "..", "..")
-    # install_lib_dir = os.path.join(ffi_dir, "..", "..", "..")
 
     dll_path = []
 
-    if os.environ.get('TF_DLPACK_LIBRARY_PATH', None):
-        dll_path.append(os.environ['TF_DLPACK_LIBRARY_PATH'])
+    if os.environ.get('TFDLPACK_LIBRARY_PATH', None):
+        dll_path.append(os.environ['TFDLPACK_LIBRARY_PATH'])
 
     if sys.platform.startswith('linux') and os.environ.get('LD_LIBRARY_PATH', None):
         dll_path.extend([p.strip() for p in os.environ['LD_LIBRARY_PATH'].split(":")])
@@ -39,7 +37,7 @@ def find_lib_path(name=None, search_path=None, optional=False):
         dll_path.extend([p.strip() for p in os.environ['DYLD_LIBRARY_PATH'].split(":")])
 
     # Pip lib directory
-    # dll_path.append(os.path.join(ffi_dir, ".."))
+    dll_path.append(source_dir)
     # Default cmake build directory
     dll_path.append(os.path.join(source_dir, "build"))
     dll_path.append(os.path.join(source_dir, "build", "Release"))
diff --git a/setup.py b/setup.py
deleted file mode 100644
index b01f995..0000000
--- a/setup.py
+++ /dev/null
@@ -1,63 +0,0 @@
-import os
-import re
-import sys
-import platform
-import subprocess
-
-from setuptools import setup, Extension
-from setuptools.command.build_ext import build_ext
-from distutils.version import LooseVersion
-
-
-class CMakeExtension(Extension):
-    def __init__(self, name, sourcedir=''):
-        Extension.__init__(self, name, sources=[])
-        self.sourcedir = os.path.abspath(sourcedir)
-
-
-class CMakeBuild(build_ext):
-    def run(self):
-        try:
-            out = subprocess.check_output(['cmake', '--version'])
-        except OSError:
-            raise RuntimeError("CMake must be installed to build the following extensions: " +
-                               ", ".join(e.name for e in self.extensions))
-
-        if platform.system() == "Windows":
-            raise RuntimeError("Windows not currently supported")
-
-        for ext in self.extensions:
-            self.build_extension(ext)
-
-    def build_extension(self, ext):
-        extdir = os.path.abspath(os.path.dirname(
-            self.get_ext_fullpath(ext.name)))
-        extdir = os.path.join(extdir, "tfdlpack", "build") # Not sure whether this is fine
-
-        cmake_args = ['-DCMAKE_LIBRARY_OUTPUT_DIRECTORY=' + extdir,
-                      '-DPYTHON_EXECUTABLE=' + sys.executable]
-
-        cfg = 'Debug' if self.debug else 'Release'
-        build_args = ['--config', cfg]
-        env = os.environ.copy()
-        if not os.path.exists(self.build_temp):
-            os.makedirs(self.build_temp)
-        subprocess.check_call(['cmake', ext.sourcedir] +
-                              cmake_args, cwd=self.build_temp, env=env)
-        subprocess.check_call(['cmake', '--build', '.'] +
-                              build_args, cwd=self.build_temp)
-
-
-setup(
-    name='tfdlpack',
-    version='0.0.1',
-    author='DGL Team',
-    author_email='allen.zhou@nyu.edu',
-    description='DLPack for tensorflow',
-    package_dir={"tfdlpack": "python/tfdlpack/"},
-    packages=["tfdlpack"],
-    long_description='',
-    ext_modules=[CMakeExtension('tfdlpack')],
-    cmdclass=dict(build_ext=CMakeBuild),
-    zip_safe=False,
-)
diff --git a/src/dlpack_op.cc b/src/dlpack_op.cc
index 05bebcf..d0d0680 100644
--- a/src/dlpack_op.cc
+++ b/src/dlpack_op.cc
@@ -1,5 +1,5 @@
 /*!
- *  Copyright (c) 2019 by Contributors
+ * Copyright (c) 2019 by Contributors
  * \file dlpack_op.cc
  * \brief dlpack op registration
  */
diff --git a/src/from_dlpack_kernel.cc b/src/from_dlpack_kernel.cc
index 3b00f2c..829e10b 100644
--- a/src/from_dlpack_kernel.cc
+++ b/src/from_dlpack_kernel.cc
@@ -1,5 +1,5 @@
 /*!
- *  Copyright (c) 2019 by Contributors
+ * Copyright (c) 2019 by Contributors
  * \file from_dlpack_kernel.cc
  * \brief from dlpack kernel
  */
@@ -11,31 +11,28 @@
 #include <tensorflow/core/framework/op_kernel.h>
 #include <tensorflow/core/framework/tensor_reference.h>
 #include <cstdio>
-#include "util.h"
+#include "./util.h"
 
 using namespace tensorflow;
-namespace tf = tensorflow;
 
 typedef Eigen::ThreadPoolDevice CPUDevice;
 typedef Eigen::GpuDevice GPUDevice;
 
-inline bool isAligned(size_t alignment, void *data_ptr) {
+inline bool IsAligned(size_t alignment, void *data_ptr) {
   auto iptr = reinterpret_cast<std::uintptr_t>(data_ptr);
   return (iptr % alignment == 0);
 }
 
 class DLPackAllocator : public Allocator {
  public:
-  static constexpr size_t kAllocatorAlignment = 64;
-
   explicit DLPackAllocator(DLManagedTensor *dlm_tensor) {
     dlm_tensor_ = dlm_tensor;
     data_ = dlm_tensor->dl_tensor.data;
 
     // Shape
     shape_ = TensorShape();
-    int ndim = dlm_tensor->dl_tensor.ndim;
-    int64_t *shape = dlm_tensor->dl_tensor.shape;
+    const int ndim = dlm_tensor->dl_tensor.ndim;
+    const int64_t *shape = dlm_tensor->dl_tensor.shape;
     for (int i = 0; i < ndim; i++) {
       shape_.AddDim(shape[i]);
     }
@@ -50,7 +47,7 @@ class DLPackAllocator : public Allocator {
           errors::Internal("Invalid number of bytes for DLPack Tensor");
       return nullptr;
     }
-    if (isAligned(alignment, data_)) {
+    if (IsAligned(alignment, data_)) {
       return data_;
     } else {
       allocation_status_ =
@@ -62,7 +59,6 @@ class DLPackAllocator : public Allocator {
   void DeallocateRaw(void *ptr) {
     // This would lead to double free, haven't figure out the problem
     dlm_tensor_->deleter(const_cast<DLManagedTensor *>(dlm_tensor_));
-    // std::cout << "Deconstruct dlpack tensor" << std::endl;
     delete this;
   }
 
@@ -79,6 +75,8 @@ class DLPackAllocator : public Allocator {
   int64 num_elements_;
   TensorShape shape_;
   Status allocation_status_;
+
+  TF_DISALLOW_COPY_AND_ASSIGN(DLPackAllocator);
 };
 
 class FromDLPackOP : public OpKernel {
@@ -93,9 +91,9 @@ class FromDLPackOP : public OpKernel {
 
     DLPackAllocator *dlpack_allocator = new DLPackAllocator(dlm_tensor);
     // Alignment is always 64 bytes for CPU and GPU in TF
-    if (isAligned(64, dlm_tensor->dl_tensor.data)) {
+    if (IsAligned(64, dlm_tensor->dl_tensor.data)) {
       // Aligned tensor using DLPackAllocator to allocate memory
-      DataType tf_dtype = toTFDataType(dtype);
+      DataType tf_dtype = ToTFDataType(dtype);
       Tensor output_tensor(dlpack_allocator, tf_dtype,
                            dlpack_allocator->get_shape());
       OP_REQUIRES_OK(context, dlpack_allocator->allocation_status());
diff --git a/src/get_device_and_dtype_kernel.cc b/src/get_device_and_dtype_kernel.cc
index 08a30d2..a5835e1 100644
--- a/src/get_device_and_dtype_kernel.cc
+++ b/src/get_device_and_dtype_kernel.cc
@@ -1,81 +1,18 @@
 /*!
- *  Copyright (c) 2019 by Contributors
+ * Copyright (c) 2019 by Contributors
  * \file get_device_and_dtype_kernel.cc
  * \brief get device and dtype kernel
  */
 #include <dlpack/dlpack.h>
 #include <tensorflow/core/framework/op_kernel.h>
 #include <cstdint>
-#include "util.h"
+#include "./util.h"
 
 using namespace tensorflow;
-namespace tf = tensorflow;
 
 typedef Eigen::ThreadPoolDevice CPUDevice;
 typedef Eigen::GpuDevice GPUDevice;
 
-DataType toTFDataType(const DLDataType &dtype) {
-  DataType tf_dtype = DT_INVALID;
-  int code = dtype.code;
-  int bits = dtype.bits;
-  switch (code) {
-    case kDLUInt:
-      switch (bits) {
-        case 8:
-          tf_dtype = DT_UINT8;
-          break;
-        case 16:
-          tf_dtype = DT_UINT16;
-          break;
-        case 32:
-          tf_dtype = DT_UINT32;
-          break;
-        case 64:
-          tf_dtype = DT_UINT64;
-          break;
-        default:
-          LOG(INFO) << "Unsupported kUInt bits";
-      }
-      break;
-    case kDLInt:
-      switch (bits) {
-        case 8:
-          tf_dtype = DT_INT8;
-          break;
-        case 16:
-          tf_dtype = DT_INT16;
-          break;
-        case 32:
-          tf_dtype = DT_INT32;
-          break;
-        case 64:
-          tf_dtype = DT_INT64;
-          break;
-        default:
-          LOG(INFO) << "Unsupported kInt bits";
-      }
-      break;
-    case kDLFloat:
-      switch (bits) {
-        case 16:
-          tf_dtype = DT_HALF;
-          break;
-        case 32:
-          tf_dtype = DT_FLOAT;
-          break;
-        case 64:
-          tf_dtype = DT_DOUBLE;
-          break;
-        default:
-          LOG(INFO) << "Unsupported kFloat bits";
-      }
-      break;
-    default:
-      LOG(INFO) << "Unsupported code";
-  }
-  return tf_dtype;
-}
-
 class GetDeviceAndDTypeOP : public OpKernel {
  public:
   explicit GetDeviceAndDTypeOP(OpKernelConstruction *context) : OpKernel(context) {}
@@ -89,7 +26,7 @@ class GetDeviceAndDTypeOP : public OpKernel {
     auto output_flat = output_tensor->flat<int32>();
     output_flat(0) = dl_tensor->dl_tensor.ctx.device_type;
     output_flat(1) = dl_tensor->dl_tensor.ctx.device_id;
-    output_flat(2) = toTFDataType(dl_tensor->dl_tensor.dtype);
+    output_flat(2) = ToTFDataType(dl_tensor->dl_tensor.dtype);
   }
 };
 
diff --git a/src/to_dlpack_kernel.cc b/src/to_dlpack_kernel.cc
index fff3fbc..ef871c5 100644
--- a/src/to_dlpack_kernel.cc
+++ b/src/to_dlpack_kernel.cc
@@ -1,5 +1,5 @@
 /*!
- *  Copyright (c) 2019 by Contributors
+ * Copyright (c) 2019 by Contributors
  * \file to_dlpack_kernel.cc
  * \brief to dlpack kernel
  */
diff --git a/src/util.cc b/src/util.cc
new file mode 100644
index 0000000..574fb6b
--- /dev/null
+++ b/src/util.cc
@@ -0,0 +1,45 @@
+/*!
+ * Copyright (c) 2019 by Contributors
+ * \file util.cc
+ * \brief Utility functions
+ */
+#include "./util.h"
+
+using namespace tensorflow;
+
+// Maps a DLPack (type code, bit width) pair to the matching TensorFlow
+// DataType. Unsupported combinations are logged at INFO level and reported
+// as DT_INVALID.
+DataType ToTFDataType(const DLDataType &dtype) {
+  const int bits = dtype.bits;
+  switch (dtype.code) {
+    case kDLUInt:
+      switch (bits) {
+        case 8: return DT_UINT8;
+        case 16: return DT_UINT16;
+        case 32: return DT_UINT32;
+        case 64: return DT_UINT64;
+      }
+      LOG(INFO) << "Unsupported kUInt bits";
+      return DT_INVALID;
+    case kDLInt:
+      switch (bits) {
+        case 8: return DT_INT8;
+        case 16: return DT_INT16;
+        case 32: return DT_INT32;
+        case 64: return DT_INT64;
+      }
+      LOG(INFO) << "Unsupported kInt bits";
+      return DT_INVALID;
+    case kDLFloat:
+      switch (bits) {
+        case 16: return DT_HALF;
+        case 32: return DT_FLOAT;
+        case 64: return DT_DOUBLE;
+      }
+      LOG(INFO) << "Unsupported kFloat bits";
+      return DT_INVALID;
+  }
+  LOG(INFO) << "Unsupported code";
+  return DT_INVALID;
+}
diff --git a/src/util.h b/src/util.h
index a80a712..b965b19 100644
--- a/src/util.h
+++ b/src/util.h
@@ -3,12 +3,17 @@
  * \file util.h
  * \brief TF data type utilities
  */
-
-#pragma once
+#ifndef TFDLPACK_UTIL_H_
+#define TFDLPACK_UTIL_H_
 
 #include <dlpack/dlpack.h>
 #include <tensorflow/core/framework/op_kernel.h>
 
-using namespace tensorflow;
+/*!
+ * \brief Convert a DLPack data type object to tensorflow DataType object.
+ * \param dtype DLPack dtype object
+ * \return tensorflow dtype object
+ */
+tensorflow::DataType ToTFDataType(const DLDataType &dtype);
 
-DataType toTFDataType(const DLDataType &dtype);
+#endif  // TFDLPACK_UTIL_H_
diff --git a/tests/scripts/README.md b/tests/scripts/README.md
new file mode 100644
index 0000000..3afd5f4
--- /dev/null
+++ b/tests/scripts/README.md
@@ -0,0 +1,17 @@
+Release steps
+===
+
+Suppose you are publishing a new version 0.x.
+
+Linux
+---
+
+First, build the wheels.
+
+```bash
+bash build_release.sh 0.x
+```
+
+The script assumes there is a 0.x branch or tag in the git repository. Run it from this directory (`tests/scripts`), which is mounted into the build containers as `/workspace`. If successful, the CPU and GPU wheels are copied into this directory.
+
+Then, upload the wheels to S3 or pypi.
diff --git a/tests/scripts/build_in_docker.sh b/tests/scripts/build_in_docker.sh
new file mode 100644
index 0000000..41f87b8
--- /dev/null
+++ b/tests/scripts/build_in_docker.sh
@@ -0,0 +1,56 @@
+#!/bin/bash
+# Build, test and package tf-dlpack inside a CI container.
+#
+# Usage: build_in_docker.sh <branch-or-tag> <ON|OFF>
+#   <branch-or-tag>  git ref of tf-dlpack to check out
+#   <ON|OFF>         value passed to cmake as -DUSE_CUDA
+#
+# The resulting wheels are copied to /workspace.
+
+set -e
+
+if [ "$#" -ne 2 ]; then
+  echo "Usage: $0 <branch-or-tag> <ON|OFF>" >&2
+  exit 1
+fi
+
+BRANCH=$1
+USE_CUDA=$2
+
+pushd /tmp
+git clone https://github.com/VoVAllen/tf-dlpack.git --recursive
+pushd tf-dlpack
+git checkout "$BRANCH"
+# The clone initialised submodules for the default branch; re-sync them for the checked-out ref.
+git submodule update --init --recursive
+
+CONDA_PREFIX=$HOME/miniconda3/bin
+export PATH=$CONDA_PREFIX:$PATH
+export PYTHONPATH=$PWD/python:$PYTHONPATH
+export TFDLPACK_LIBRARY_PATH=$PWD/build
+for PY_VER in 3.6.4 3.7.0; do
+  echo "Build for python $PY_VER"
+  source activate $PY_VER
+  # clean & build (parallelism bounded by the number of cores instead of unlimited -j)
+  rm -rf build
+  mkdir build
+  cd build; cmake -DUSE_CUDA="$USE_CUDA" ..; make -j"$(nproc)"; cd ..
+  # test (only run for the CUDA build) and pick the package suffix
+  if [ "$USE_CUDA" = "ON" ]; then
+    python -m pytest tests
+    export TFDLPACK_PACKAGE_SUFFIX=-gpu
+  else
+    export TFDLPACK_PACKAGE_SUFFIX=
+  fi
+  # build wheel
+  pushd python
+  python setup.py clean
+  python setup.py bdist_wheel --plat-name manylinux1_x86_64
+  popd
+  source deactivate
+done
+
+cp python/dist/*.whl /workspace
+
+popd
+popd
diff --git a/tests/scripts/build_release.sh b/tests/scripts/build_release.sh
new file mode 100644
index 0000000..b121d98
--- /dev/null
+++ b/tests/scripts/build_release.sh
@@ -0,0 +1,29 @@
+#!/bin/bash
+# Build the CPU and GPU release wheels inside the CI docker images.
+#
+# Usage: build_release.sh <branch-or-tag>
+#
+# The wheels end up in the directory that contains this script.
+
+set -e
+
+if [ -z "$1" ]; then
+  echo "Usage: $0 <branch-or-tag>" >&2
+  exit 1
+fi
+BRANCH=$1
+
+# The containers run /workspace/build_in_docker.sh, so the mounted directory
+# must be the one holding this script, whatever the caller's cwd is.
+cd "$(dirname "${BASH_SOURCE[0]}")"
+
+rm -f -- *.whl
+
+docker run -it --rm -v "$PWD":/workspace \
+  --name tfdlpack-build-cpu dgllib/tfdlpack-ci-cpu:latest \
+  bash /workspace/build_in_docker.sh "$BRANCH" OFF
+
+docker run -it --rm -v "$PWD":/workspace \
+  --runtime nvidia \
+  --name tfdlpack-build-gpu dgllib/tfdlpack-ci-gpu:latest \
+  bash /workspace/build_in_docker.sh "$BRANCH" ON
diff --git a/tests/scripts/task_build.sh b/tests/scripts/task_build.sh
new file mode 100644
index 0000000..23f254d
--- /dev/null
+++ b/tests/scripts/task_build.sh
@@ -0,0 +1,31 @@
+#!/bin/bash
+# CI build task: compile the plugin library with cmake, then install the
+# python package against it.
+#
+# USE_CUDA (environment variable, default ON) selects the CUDA build and the
+# matching package suffix.
+
+set -e
+
+USE_CUDA=${USE_CUDA:-ON}
+
+rm -rf build
+mkdir build
+
+pushd build
+cmake -DUSE_CUDA="$USE_CUDA" ..
+make -j4
+popd
+
+export TFDLPACK_LIBRARY_PATH=$PWD/build
+# Keep the package name/requirement in sync with the library that was just built.
+if [ "$USE_CUDA" = "ON" ]; then
+  export TFDLPACK_PACKAGE_SUFFIX=-gpu
+else
+  export TFDLPACK_PACKAGE_SUFFIX=
+fi
+
+pushd python
+python3 setup.py clean
+python3 setup.py install
+popd
diff --git a/tests/test_zero_copy.py b/tests/test_zero_copy.py
index b7b3c66..bc884f2 100644
--- a/tests/test_zero_copy.py
+++ b/tests/test_zero_copy.py
@@ -2,10 +2,10 @@
 import torch as th
 from torch.utils.dlpack import from_dlpack, to_dlpack
 from tfdlpack import from_dlpack as tf_from_dlpack
-import gpustat
 import pytest
 
 def get_gpu_memory_used():
+    import gpustat
     gpu_query = gpustat.GPUStatCollection.new_query()
     gmem_used = gpu_query[0].memory_used
     return gmem_used
@@ -28,4 +28,4 @@ def test_zero_copy():
     print(m3)
 
 if __name__ == "__main__":
-    test_zero_copy()
\ No newline at end of file
+    test_zero_copy()