Merge pull request #2 from quant-sci/major_organization

Major organization
quant-sci · Apr 22, 2024 · 05fd873 · 05fd873
2 parents 698fe63 + b79411e
commit 05fd873
Show file tree

Hide file tree

Showing 288 changed files with 23,453 additions and 274 deletions.
diff --git a/README.md b/README.md
@@ -6,6 +6,8 @@
 [![GitHub](https://img.shields.io/github/license/kleyt0n/optymus)](https://github.com/kleyt0n/optymus/blob/master/LICENSE)
 [![Documentation Status](https://readthedocs.org/projects/optymus/badge/?version=latest)](https://optymus.readthedocs.io/en/latest/?badge=latest)
 
+> Check the [Optimization in Deep Learning and Engineering](https://quantsci.org/odle-book) material.
+
 > Optymus is part of [quantsci](https://quantsci.org) project.
 
 This library provides a comprehensive collection of optimization methods, both with and without constraints. The main goal is to provide a simple structure to improve research and development in optimization problems.
@@ -35,23 +37,21 @@ To begin using _optymus_, follow these steps:
 
 3. **Get Started:**
    ```python
-   from optymus.minimize import Optimizer
+   from optymus.optim import Optimizer
+   from optymus.utils import mccormick_function
    
    import numpy as np
-   f = lambda x: x[0]**[2]-3*x[0]*x[1]+4*x[1]**2+x[0]-x[1]
-   grad = lambda x: np.array([2*x[0]-3*x[1]+1, -3*x[0]+8*x[1]-1])
-   hess = lambda x: np.array([[2, -3], [-3, 8]])
+
+   f = mccormick_function()
    initial_point = np.array([2, 2])
 
-   optimizer = Optimizer(f_obj=f,
-                        x0=initial_point,
-                        grad=grad,
-                        hess=hess,
-                        method='bfgs')
+   opt = Optimizer(f_obj=f,
+                   x0=initial_point,
+                   method='bfgs')
 
-   optimizer.report()
+   opt.report()
 
-   optimizer.plot()
+   opt.plot()
    ```
 
 Refer to the documentation for detailed information on each method and its application.
@@ -69,10 +69,11 @@ Contributions to Optymus are highly appreciated. If you have additional optimiza
 If you use Optymus in your research, please consider citing the library using the following BibTeX entry:
 
 ```bibtex
-@misc{optymus2024costa,
+@misc{optymus2024,
   author = {Costa, Kleyton and Menezes, Ivan},
   title = {Optymus: Optimization Methods Library for Python},
   year = {2024},
   note = {GitHub Repository},
-  url = {https://github.com/kleyt0n/optymus}
+  url = {https://github.com/quantsci/optymus}
 }
+```
diff --git a/book/_build/.doctrees/appendix/benchmark_functions.doctree b/book/_build/.doctrees/appendix/benchmark_functions.doctree
diff --git a/book/_build/.doctrees/appendix/libraries.doctree b/book/_build/.doctrees/appendix/libraries.doctree
diff --git a/book/_build/.doctrees/appendix/math_review.doctree b/book/_build/.doctrees/appendix/math_review.doctree
diff --git a/book/_build/.doctrees/appendix/optymus.doctree b/book/_build/.doctrees/appendix/optymus.doctree
diff --git a/book/_build/.doctrees/chapters/ch1/challenges.doctree b/book/_build/.doctrees/chapters/ch1/challenges.doctree
diff --git a/book/_build/.doctrees/chapters/ch1/differences.doctree b/book/_build/.doctrees/chapters/ch1/differences.doctree
diff --git a/book/_build/.doctrees/chapters/ch1/introduction.doctree b/book/_build/.doctrees/chapters/ch1/introduction.doctree
diff --git a/book/_build/.doctrees/chapters/ch2/batch.doctree b/book/_build/.doctrees/chapters/ch2/batch.doctree
diff --git a/book/_build/.doctrees/chapters/ch2/important_concepts.doctree b/book/_build/.doctrees/chapters/ch2/important_concepts.doctree
diff --git a/book/_build/.doctrees/chapters/ch2/stochastic_functions.doctree b/book/_build/.doctrees/chapters/ch2/stochastic_functions.doctree
diff --git a/book/_build/.doctrees/chapters/ch2/surrogate.doctree b/book/_build/.doctrees/chapters/ch2/surrogate.doctree
diff --git a/book/_build/.doctrees/chapters/ch3/adaptative_learning.doctree b/book/_build/.doctrees/chapters/ch3/adaptative_learning.doctree
diff --git a/book/_build/.doctrees/chapters/ch3/first_order.doctree b/book/_build/.doctrees/chapters/ch3/first_order.doctree
diff --git a/book/_build/.doctrees/chapters/ch3/linear_search.doctree b/book/_build/.doctrees/chapters/ch3/linear_search.doctree
diff --git a/book/_build/.doctrees/chapters/ch3/momentum.doctree b/book/_build/.doctrees/chapters/ch3/momentum.doctree
diff --git a/book/_build/.doctrees/chapters/ch3/optimization_algorithms.doctree b/book/_build/.doctrees/chapters/ch3/optimization_algorithms.doctree
diff --git a/book/_build/.doctrees/chapters/ch3/second_order.doctree b/book/_build/.doctrees/chapters/ch3/second_order.doctree
diff --git a/book/_build/.doctrees/chapters/ch3/zero_order.doctree b/book/_build/.doctrees/chapters/ch3/zero_order.doctree
diff --git a/book/_build/.doctrees/chapters/ch4/information_algorithms.doctree b/book/_build/.doctrees/chapters/ch4/information_algorithms.doctree
diff --git a/book/_build/.doctrees/chapters/ch5/experiments.doctree b/book/_build/.doctrees/chapters/ch5/experiments.doctree
diff --git a/book/_build/.doctrees/chapters/conclusion.doctree b/book/_build/.doctrees/chapters/conclusion.doctree
diff --git a/book/_build/.doctrees/environment.pickle b/book/_build/.doctrees/environment.pickle
diff --git a/book/_build/.doctrees/intro.doctree b/book/_build/.doctrees/intro.doctree
diff --git a/book/_build/html/.buildinfo b/book/_build/html/.buildinfo
@@ -0,0 +1,4 @@
+# Sphinx build info version 1
+# This file hashes the configuration used when building these files. When it is not found, a full rebuild will be done.
+config: 6c09b7289980afc9b768bddc95e05034
+tags: 645f666f9bcd5a90fca523b33c5a78b7
diff --git a/book/_build/html/_images/banner.png b/book/_build/html/_images/banner.png
diff --git a/book/_build/html/_sources/appendix/benchmark_functions.md b/book/_build/html/_sources/appendix/benchmark_functions.md
@@ -0,0 +1,77 @@
+---
+jupytext:
+  formats: md:myst
+  text_representation:
+    extension: .md
+    format_name: myst
+    format_version: 0.13
+    jupytext_version: 1.11.5
+kernelspec:
+  display_name: Python 3
+  language: python
+  name: python3
+---
+
+# Benchmark functions
+
+## Without constraints
+
+### Rastrigin function
+
+$$f(x) = An + \sum_{i=1}^{n}(x_{i}^{2} - A\cos(2\pi x_{i}))$$
+
+where $A=10$.
+
+```{code-cell}
+from optymus.utils import plot_function, rastrigin_function
+
+f = rastrigin_function()
+plot_function(f, title='Rastrigin Function', min=-5.12, max=5.12)
+``` 
+
+### Ackley function
+
+$$f(x,y) = -20\exp\left[-0.2\sqrt{0.5(x^{2}+y^{2})}\right]-\exp\left[0.5(\cos(2\pi x)+\cos(2\pi y))\right]+e+20$$
+
+
+### Eggholder function
+
+$$f(x,y) = -(y+47)\sin\left(\sqrt{\left|y+\frac{x}{2}+47\right|}\right)-x\sin\left(\sqrt{|x-(y+47)|}\right)$$
+
+
+### Cross-in-tray function
+
+
+$$f(x,y) = -0.0001\left(\left|\sin(x)\sin(y)\exp\left(\left|100-\frac{\sqrt{x^{2}+y^{2}}}{\pi}\right|\right)\right|+1\right)^{0.1}$$
+
+### Sphere function
+
+$$f(x) = \sum_{i=1}^{n}x_{i}^{2}$$
+
+### Rosenbrock function
+
+$$f(x,y) = (1-x)^{2}+100(y-x^{2})^{2}$$
+
+### Beale function
+
+$$f(x,y) = (1.5-x+xy)^{2}+(2.25-x+xy^{2})^{2}+(2.625-x+xy^{3})^{2}$$
+
+
+### Goldstein–Price function
+
+$$ f(x,y) = [1+(x+y+1)^{2}(19-14x+3x^{2}-14y+6xy+3y^{2})][30+(2x-3y)^{2}(18-32x+12x^{2}+48y-36xy+27y^{2})]$$
+
+### Booth function
+
+$$f(x,y) = (x+2y-7)^{2}+(2x+y-5)^{2}$$
+
+### Styblinski–Tang function
+
+$$f(x) = \sum_{i=1}^{n}(x_{i}^{4}-16x_{i}^{2}+5x_{i})/2$$
+
+
+### McCormick function
+
+$$f(x,y) = \sin(x+y)+(x-y)^{2}-1.5x+2.5y+1$$
+
+## With constraints
diff --git a/book/_build/html/_sources/appendix/libraries.md b/book/_build/html/_sources/appendix/libraries.md
@@ -0,0 +1,2 @@
+# Optimization libraries
+
diff --git a/book/_build/html/_sources/appendix/math_review.md b/book/_build/html/_sources/appendix/math_review.md
@@ -0,0 +1,5 @@
+# Math review
+
+## Calculus
+
+## Linear algebra
diff --git a/book/_build/html/_sources/appendix/optymus.md b/book/_build/html/_sources/appendix/optymus.md
@@ -0,0 +1 @@
+# Optymus library
diff --git a/book/_build/html/_sources/chapters/ch1/challenges.md b/book/_build/html/_sources/chapters/ch1/challenges.md
@@ -0,0 +1,19 @@
+# Challenges in deep learning optimization
+
+Machine learning has often avoided the difficulty of general optimization by carefully designing the objective function and constraints to ensure that the optimization problem is convex. Training neural networks, however, usually confronts the non-convex case.
+
+## Ill-conditioning
+
+## Local minima
+
+## Plateaus, saddle points, and flat regions
+
+## Cliffs and exploding gradients
+
+## Long-term dependencies
+
+## Inexact gradients
+
+## Poor correspondence between local and global structure
+
+## Theoretical limits of optimization
diff --git a/book/_build/html/_sources/chapters/ch1/differences.md b/book/_build/html/_sources/chapters/ch1/differences.md
@@ -0,0 +1,17 @@
+# The difference between learning and pure optimization
+
+In most machine learning scenarios, we care about some performance measure $P$ that is defined with respect to the test set and may be intractable. We therefore optimize $P$ only indirectly: we reduce a cost function $J(\theta)$ in the **hope** that doing so will improve $P$. 
+
+For example, $P$ can be the binary accuracy measure as 
+
+$$\frac{TP+TN}{TP+TN+FP+FN}$$
+
+Typically, the cost function can be written as an average over the training set, 
+
+$$J(\theta)=\mathbb{E}_{(x,y)\sim \hat{p}_{data}}\mathcal{L}(f(x, \theta), y)$$
+
+where $\mathcal{L}$ is the loss function, $f(x,\theta)$ is the predicted output when the input is $x$, and $\hat{p}_{data}$ is the empirical distribution. Considering a supervised learning scenario, $y$ is the target output. 
+
+The equation above defines an objective function with respect to the training set. Usually we would prefer to minimize the corresponding objective in which the expectation is taken across the data-generating distribution $p_{data}$ rather than just over the finite training set; in that case we write the cost function as 
+
+$$J(\theta)=\mathbb{E}_{(x,y)\sim p_{data}}\mathcal{L}(f(x, \theta), y)$$
diff --git a/book/_build/html/_sources/chapters/ch1/introduction.md b/book/_build/html/_sources/chapters/ch1/introduction.md
@@ -0,0 +1,3 @@
+# Introduction
+
+The optimization for training deep learning focuses on one particular case of optimization: finding the parameters $\theta$ of a neural network that significantly reduce a cost function $J(\theta)$, which typically includes a performance measure evaluated on the entire training set as well as additional regularization terms.
diff --git a/book/_build/html/_sources/chapters/ch2/batch.md b/book/_build/html/_sources/chapters/ch2/batch.md
@@ -0,0 +1,15 @@
+# Batch and minibatch
+
+In machine learning algorithms the objective function usually decomposes as a sum over the training sample. Optimization algorithms for machine learning typically compute each update to the parameters based on the expected value of the cost function estimated using only a subset of the terms of the full cost function.
+
+Most of the properties of $J$ used by optimization algorithms are also expectations over the training set. The most commonly used property is the gradient 
+
+$$\nabla_{\theta}J(\theta)=\mathbb{E}_{x,y\sim\hat{p}_{data}}\nabla_{\theta}\log p_{model}(x,y;\theta)$$
+
+Optimization algorithms that use the entire training set are called **batch** or **deterministic** gradient methods. Optimization algorithms that use only a single example at a time are sometimes called **stochastic** or **online** gradient methods. Optimization algorithms that use some number between 2 and $n-1$ training examples are called **minibatch** methods. It is also common to simply call these methods **stochastic**.
+
+A canonical example of minibatch is the stochastic gradient descent method. The minibatch sizes are generally driven by the following factors: (i) larger batches provide a more accurate estimate of the gradient; (ii) multicore architectures are usually underutilized by extremely small batches. 
+
+There are some hardware considerations about the batch size in GPU context. Small batches can offer a regularizing effect, perhaps due to the noise they add to the learning process.
+
+First-order methods are usually relatively robust and can handle smaller batch sizes like 100. Second-order methods typically require much larger batch sizes like 10,000. The minibatches must be selected randomly. A motivation for minibatch SGD is that it follows the gradient of the true _generalization error_ so long as no examples are repeated.
diff --git a/book/_build/html/_sources/chapters/ch2/important_concepts.md b/book/_build/html/_sources/chapters/ch2/important_concepts.md
@@ -0,0 +1 @@
+# Basic concepts
diff --git a/book/_build/html/_sources/chapters/ch2/stochastic_functions.md b/book/_build/html/_sources/chapters/ch2/stochastic_functions.md
@@ -0,0 +1,28 @@
+# Stochastic functions and optimization
+
+## Stochastic functions
+
+A **stochastic function** is a mathematical function that incorporates randomness in its output. In other words, for a given input, the function does not produce a single deterministic value but rather a probability distribution over possible values. This randomness can be due to various factors, such as measurement errors, inherent variability in the system being modeled, or deliberate introduction of noise.
+
+Consider the movement of a stock price over time. We could model this as a stochastic function:
+
+$$S(t) = S_0 + \mu t + \sigma W(t)$$
+
+- $S(t)$ represents the stock price at time t
+- $S_0$ is the initial stock price
+- $\mu$ is the average drift of the stock price
+- $\sigma$ is the volatility (standard deviation) of price changes
+- $W(t)$ is a Wiener process (a standard model for random fluctuations)
+
+The Wiener process introduces randomness, making the stock price evolution unpredictable at any given point in time.
+
+## Stochastic optimization
+
+Stochastic optimization methods are algorithms designed to solve optimization problems involving stochastic functions. These methods aim to find the optimal solution (e.g., minimum or maximum) of a function whose output is subject to random variations. 
+
+Here's how stochastic optimization methods generally work: 
+
+1. Sample the stochastic function: Since the function's output is random, we need to evaluate it multiple times for different realizations of the random variable ω. This provides us with a set of noisy observations of the function. 
+2. Use the observations to estimate the function's properties: Based on the sampled values, we can estimate the function's expected value, gradient, or other relevant characteristics. 
+3. Update the solution based on the estimated properties: Using the estimated information, we update the current solution towards the optimum. This update step often involves algorithms similar to those used in deterministic optimization, but with modifications to account for the noise. 
+4. Repeat steps 1-3 iteratively: The process of sampling, estimation, and update is repeated until convergence to a satisfactory solution is achieved.
diff --git a/book/_build/html/_sources/chapters/ch2/surrogate.md b/book/_build/html/_sources/chapters/ch2/surrogate.md
@@ -0,0 +1,10 @@
+# Surrogate loss function and early stopping
+
+Not infrequently, the loss function we actually care about cannot be optimized efficiently. In this case we can optimize a **surrogate loss function** instead.
+
+Some advantages of surrogate:
+
+- negative log-likelihood of the correct class is typically used as a surrogate for the 0-1 loss;
+- negative log-likelihood allows the model to estimate the conditional probability of the classes, given the input, and if the model can do it well, then it picks the classes that yield the least classification error in expectation;
+
+An important difference between optimization in general and optimization as we use for training algorithms: (i) training algorithms do not usually halt at the local minimum; (ii) a machine learning algorithm usually minimizes a **surrogate loss function** but halts when a convergence criterion based on **early stopping** is satisfied; (iii) training often halts while the surrogate loss function still has large derivatives. This is very different from the pure optimization setting, where an optimization algorithm is considered to have converged when the gradient becomes very small.
diff --git a/book/_build/html/_sources/chapters/ch3/adaptative_learning.md b/book/_build/html/_sources/chapters/ch3/adaptative_learning.md
@@ -0,0 +1,11 @@
+# Adaptative learning methods
+
+## AdaGrad
+
+## RMSProp
+
+## Adam
+
+## Adamax
+
+## Yogi
diff --git a/book/_build/html/_sources/chapters/ch3/first_order.md b/book/_build/html/_sources/chapters/ch3/first_order.md
@@ -0,0 +1,11 @@
+# First-order methods
+
+## Gradient descent
+
+## Stochastic gradient descent
+
+## Conjugate gradients
+
+## BFGS
+
+## L-BFGS
diff --git a/book/_build/html/_sources/chapters/ch3/linear_search.md b/book/_build/html/_sources/chapters/ch3/linear_search.md
@@ -0,0 +1,8 @@
+# Line search
+
+
+## Constant step
+
+## Bisection
+
+## Golden section
diff --git a/book/_build/html/_sources/chapters/ch3/momentum.md b/book/_build/html/_sources/chapters/ch3/momentum.md
@@ -0,0 +1,5 @@
+# Momentum 
+
+## Momentum
+
+## Nesterov momentum
diff --git a/book/_build/html/_sources/chapters/ch3/optimization_algorithms.md b/book/_build/html/_sources/chapters/ch3/optimization_algorithms.md
@@ -0,0 +1,18 @@
+# Optimization algorithms
+
+
+| Method                  | Zero-order | First-order | Second-order | Adaptative learning |
+| ----------------------- | ---------- | ----------- | ------------ | ----------- |
+| Univariant              | X          |             |              |                     |
+| Powell's                | X          |             |              |                     |
+| Gradient descent        |            | X           |              |                     |
+| SGD                 |            | X           |              |                     |
+| Conjugate gradients |            | X           |              |                     |
+| BFGS                |            | X           |              |                     |
+| L-BFGS              |            | X           |              |                     |
+| Newton-Raphson          |            |             | X            |                     |
+| AdaGrad             |            |             |              | X                   |
+| RMSProp             |            |             |              | X                   |
+| Adam                |            |             |              | X                   |
+| Adamax                  |            |             |              | X                   |
+| Yogi                    |            |             |              | X                   |
diff --git a/book/_build/html/_sources/chapters/ch3/second_order.md b/book/_build/html/_sources/chapters/ch3/second_order.md
@@ -0,0 +1,3 @@
+# Second-order methods
+
+## Newton-Raphson
diff --git a/book/_build/html/_sources/chapters/ch3/zero_order.md b/book/_build/html/_sources/chapters/ch3/zero_order.md
@@ -0,0 +1,5 @@
+# Zero-order methods
+
+## Univariant
+
+## Powell's method
diff --git a/book/_build/html/_sources/chapters/ch4/information_algorithms.md b/book/_build/html/_sources/chapters/ch4/information_algorithms.md
@@ -0,0 +1 @@
+# Information-theoretical algorithms
diff --git a/book/_build/html/_sources/chapters/ch5/experiments.md b/book/_build/html/_sources/chapters/ch5/experiments.md
@@ -0,0 +1,2 @@
+# Experiments
+
diff --git a/book/_build/html/_sources/chapters/conclusion.md b/book/_build/html/_sources/chapters/conclusion.md
@@ -0,0 +1 @@
+# Final remarks
diff --git a/book/_build/html/_sources/intro.md b/book/_build/html/_sources/intro.md
@@ -0,0 +1,24 @@
+# Welcome
+
+As primeiras versões deste material foram iniciadas no segundo semestre de 2023 quando cursei a disciplina de "Algoritmos de Otimização com Aplicações em Engenharia Mecânica", ministrada pelo professor Ivan Menezes (Department of Mechanical Engineering at PUC-Rio). No primeiro semestre de 2024 continuamos o nosso trabalho de investigação dos métodos de otimização mas agora com ênfase em otimização no processo de treinamento de deep learning. Paralelamente, a [optymus library](https://github.com/quant-sci/optymus) foi construída sob a orientação do professor Hélio Côrtes Vieira Lopes (Department of Informatics at PUC-Rio) e foi agregada como uma ferramenta de suporte para os métodos de otimização implementados. 
+
+## Acknowledgments
+
+
+## How to cite
+
+```bibtex
+@book{optymus2024,
+    title = {Optimization in Deep Learning and Engineering},
+    author = {da Costa, K. and Menezes, I. and Lopes, H.},
+    year = {2024},
+    url = {https://quantsci.org/odle-book}
+}
+```
+
+## Table of contents
+
+Check out the content pages bundled with this sample book to see more.
+
+```{tableofcontents}
+```
diff --git a/book/_build/html/_sphinx_design_static/design-style.1e8bd061cd6da7fc9cf755528e8ffc24.min.css b/book/_build/html/_sphinx_design_static/design-style.1e8bd061cd6da7fc9cf755528e8ffc24.min.css
diff --git a/book/_build/html/_sphinx_design_static/design-tabs.js b/book/_build/html/_sphinx_design_static/design-tabs.js
@@ -0,0 +1,27 @@
+var sd_labels_by_text = {}; // Registry: "data-sync-id" value -> array of tab label elements; filled by ready().
+
+function ready() {
+  const li = document.getElementsByClassName("sd-tab-label");
+  for (const label of li) {
+    syncId = label.getAttribute("data-sync-id");
+    if (syncId) {
+      label.onclick = onLabelClick;
+      if (!sd_labels_by_text[syncId]) {
+        sd_labels_by_text[syncId] = [];
+      }
+      sd_labels_by_text[syncId].push(label);
+    }
+  }
+}
+
+function onLabelClick() {
+  // Activate other inputs with the same sync id.
+  syncId = this.getAttribute("data-sync-id");
+  for (label of sd_labels_by_text[syncId]) {
+    if (label === this) continue;
+    label.previousElementSibling.checked = true;
+  }
+  window.localStorage.setItem("sphinx-design-last-tab", syncId);
+}
+
+document.addEventListener("DOMContentLoaded", ready, false); // Run ready() when the DOMContentLoaded event fires.