Skip to content

Commit

Permalink
Merge branch 'master' into patch/mage1
Browse files Browse the repository at this point in the history
  • Loading branch information
Sharpz7 committed Jun 28, 2023
2 parents b327ee4 + 32c79cc commit fe68a24
Show file tree
Hide file tree
Showing 219 changed files with 12,081 additions and 4,882 deletions.
8 changes: 4 additions & 4 deletions .circleci/config.yml
Original file line number Diff line number Diff line change
Expand Up @@ -453,24 +453,24 @@ workflows:
- release-armadactl:
filters:
tags:
only: /.*/
only: /v[0-9]+\.[0-9]+\.[0-9]+/
branches:
ignore: /.*/
- release-docker-images:
filters:
tags:
only: /.*/
only: /v[0-9]+\.[0-9]+\.[0-9]+/
branches:
ignore: /.*/
- release-charts:
filters:
tags:
only: /.*/
only: /v[0-9]+\.[0-9]+\.[0-9]+/
branches:
ignore: /.*/
- release-dotnet-client:
filters:
tags:
only: /.*/
only: /v[0-9]+\.[0-9]+\.[0-9]+/
branches:
ignore: /.*/
42 changes: 22 additions & 20 deletions .github/workflows/airflow-operator.yml
Original file line number Diff line number Diff line change
Expand Up @@ -5,39 +5,41 @@ on:
branches-ignore:
- master
paths:
- 'client/python/**'
- 'build/python-client/**'
- 'pkg/api/*.proto'
- '.github/workflows/airflow-operator.yml'
- '.github/workflows/python-client.yml'
- 'docs/python_armada_client.md'
- 'scripts/build-python-client.sh'
- 'third_party/airflow/**'
- '.github/workflows/python-tests/*'
- 'build/airflow-operator/**'
- 'pkg/api/jobservice/*.proto'
- '.github/workflows/airflow-operator.yml'
- 'build/python-client/**'
- 'client/python/**'
- 'docs/python_airflow_operator.md'
- 'scripts/build-airflow-operator.sh'
- 'docs/python_armada_client.md'
- 'internal/jobservice/*'
- 'makefile'
- '.github/workflows/python-tests/*'
- 'pkg/api/*.proto'
- 'pkg/api/jobservice/*.proto'
- 'scripts/build-airflow-operator.sh'
- 'scripts/build-python-client.sh'
- 'third_party/airflow/**'

pull_request:
branches-ignore:
- gh-pages
paths:
- 'client/python/**'
- 'build/python-client/**'
- 'pkg/api/*.proto'
- '.github/workflows/airflow-operator.yml'
- '.github/workflows/python-client.yml'
- 'docs/python_armada_client.md'
- 'scripts/build-python-client.sh'
- 'third_party/airflow/**'
- '.github/workflows/python-tests/*'
- 'build/airflow-operator/**'
- 'pkg/api/jobservice/*.proto'
- '.github/workflows/airflow-operator.yml'
- 'build/python-client/**'
- 'client/python/**'
- 'docs/python_airflow_operator.md'
- 'scripts/build-airflow-operator.sh'
- 'docs/python_armada_client.md'
- 'internal/jobservice/*'
- 'makefile'
- '.github/workflows/python-tests/*'
- 'pkg/api/*.proto'
- 'pkg/api/jobservice/*.proto'
- 'scripts/build-airflow-operator.sh'
- 'scripts/build-python-client.sh'
- 'third_party/airflow/**'

jobs:
airflow-tox:
Expand Down
4 changes: 2 additions & 2 deletions .github/workflows/go.yml
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,7 @@ jobs:
- name: golangci-lint
uses: golangci/golangci-lint-action@v3
with:
version: "latest"
version: "v1.53.1"
skip-pkg-cache: true
skip-build-cache: true
args: "-c ./.golangci.yml --timeout=10m --issues-exit-code=1 --max-issues-per-linter=0 --sort-results ./..."
Expand Down Expand Up @@ -63,4 +63,4 @@ jobs:
uses: actions/upload-artifact@v3.1.1
with:
name: junit.xml
path: test_reports/junit.xml
path: test_reports/junit.xml
4 changes: 4 additions & 0 deletions .golangci.yml
Original file line number Diff line number Diff line change
@@ -1,6 +1,10 @@
issues:
max-issues-per-linter: 0
max-same-issues: 0
exclude-rules:
- path: internal/scheduler/schedulerobjects/podutils_test.go
linters:
- lll

output:
print-issued-lines: true
Expand Down
14 changes: 14 additions & 0 deletions .mergify.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
pull_request_rules:
- name: Require approval from Armada maintainers
conditions:
- "#approved-reviews-by>=1"
actions:
post_check:
success_conditions:
- or:
- "#approved-reviews-by>=2"
- and:
- "#approved-reviews-by>=1"
- "author~=^(JamesMurkin|severinson|d80tb7|carlocamurri|kannon92|dejanzele|Sharpz7|ClifHouck|robertdavidsmith|theAntiYeti|richscott|suprjinx|zuqq)"
title:
Two checks are required.
13 changes: 13 additions & 0 deletions client/DotNet/Armada.Client/ClientGenerated.cs
Original file line number Diff line number Diff line change
Expand Up @@ -1230,6 +1230,19 @@ public partial class ApiJob
[Newtonsoft.Json.JsonProperty("scheduler", Required = Newtonsoft.Json.Required.Default, NullValueHandling = Newtonsoft.Json.NullValueHandling.Ignore)]
public string Scheduler { get; set; }

/// <summary>max(
///
/// sum across all containers,
/// max over all init containers,
///
/// )
///
/// This is because containers run in parallel, whereas initContainers run serially.
/// This field is populated automatically at submission.
/// Submitting a job with this field already populated results in an error.</summary>
[Newtonsoft.Json.JsonProperty("schedulingResourceRequirements", Required = Newtonsoft.Json.Required.Default, NullValueHandling = Newtonsoft.Json.NullValueHandling.Ignore)]
public V1ResourceRequirements SchedulingResourceRequirements { get; set; }

[Newtonsoft.Json.JsonProperty("services", Required = Newtonsoft.Json.Required.Default, NullValueHandling = Newtonsoft.Json.NullValueHandling.Ignore)]
public System.Collections.Generic.ICollection<ApiServiceConfig> Services { get; set; }

Expand Down
5 changes: 3 additions & 2 deletions client/python/pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -9,8 +9,9 @@ license = { text = "Apache Software License" }
authors = [{ name = "G-Research Open Source Software", email = "armada@armadaproject.io" }]

[project.optional-dependencies]
format = ["black==23.3.0", "flake8==6.0.0", "pylint==2.17.3"]
docs = ["sphinx", "sphinx-jekyll-builder", "sphinx-toolbox==3.2.0b1"]
format = ["black==23.3.0", "flake8==6.0.0", "pylint==2.17.4"]
# note(JayF): sphinx-jekyll-builder was broken by sphinx-markdown-builder 0.6 -- so pin sphinx-markdown-builder below 0.6
docs = ["sphinx", "sphinx-jekyll-builder", "sphinx-toolbox==3.2.0b1", "sphinx-markdown-builder<0.6"]
test = ["pytest==7.3.1", "coverage>=6.5.0", "pytest-asyncio==0.21.0"]

[build-system]
Expand Down
10 changes: 1 addition & 9 deletions cmd/armada-load-tester/cmd/root.go
Original file line number Diff line number Diff line change
Expand Up @@ -21,15 +21,7 @@ var rootCmd = &cobra.Command{
Command line utility to submit many jobs to armada
Persistent config can be saved in a config file so it doesn't have to be specified every command.
Example structure:
armadaUrl: localhost:50051
basicAuth:
username: user1
password: password123
The location of this file can be passed in using --config argument or picked from $HOME/.armadactl.yaml.
`,
The location of this file can be passed in using --config argument or picked from $HOME/.armadactl.yaml.`,
}

// Execute adds all child commands to the root command and sets flags appropriately.
Expand Down
20 changes: 20 additions & 0 deletions cmd/armada/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@ import (
"context"
"fmt"
"net/http"
_ "net/http/pprof"
"os"
"os/signal"
"syscall"
Expand Down Expand Up @@ -46,6 +47,25 @@ func main() {

log.Info("Starting...")

// Importing net/http/pprof automatically binds profiling endpoints to http.DefaultServeMux.
// Here, we keep the original mux (which holds the pprof handlers) as a separate mux and
// replace http.DefaultServeMux with a fresh one, so profiling is not served on the default mux.
// The profiling endpoints are only exposed if config.PprofPort is not nil.
pprofMux := http.DefaultServeMux
http.DefaultServeMux = http.NewServeMux()
if config.PprofPort != nil {
go func() {
server := &http.Server{
Addr: fmt.Sprintf("localhost:%d", *config.PprofPort),
Handler: pprofMux,
}
log := log.NewEntry(log.New())
log.Infof("profiling endpoints exposed on %s", server.Addr)
if err := server.ListenAndServe(); err != nil {
logging.WithStacktrace(log, err).Error("profiling server exited")
}
}()
}

// Run services within an errgroup to propagate errors between services.
g, ctx := errgroup.WithContext(context.Background())

Expand Down
48 changes: 44 additions & 4 deletions cmd/armadactl/cmd/scheduling.go
Original file line number Diff line number Diff line change
Expand Up @@ -18,9 +18,39 @@ func getSchedulingReportCmd(a *armadactl.App) *cobra.Command {
return initParams(cmd, a.Params)
},
RunE: func(cmd *cobra.Command, args []string) error {
return a.GetSchedulingReport()
verbosity, err := cmd.Flags().GetCount("verbose")
if err != nil {
return err
}

queueName, err := cmd.Flags().GetString("queue")
if err != nil {
return err
}
queueName = strings.TrimSpace(queueName)
if queueName != "" {
return a.GetSchedulingReportForQueue(queueName, int32(verbosity))
}

jobId, err := cmd.Flags().GetString("job")
if err != nil {
return err
}
jobId = strings.TrimSpace(jobId)
if jobId != "" {
return a.GetSchedulingReportForJob(jobId, int32(verbosity))
}

return a.GetSchedulingReport(int32(verbosity))
},
}

cmd.Flags().CountP("verbose", "v", "report verbosity; repeat (e.g., -vvv) to increase verbosity")

cmd.Flags().String("queue", "", "get scheduler reports relevant for this queue; mutually exclusive with --job")
cmd.Flags().String("job", "", "get scheduler reports relevant for this job; mutually exclusive with --queue")
cmd.MarkFlagsMutuallyExclusive("queue", "job")

return cmd
}

Expand All @@ -34,15 +64,25 @@ func getQueueSchedulingReportCmd(a *armadactl.App) *cobra.Command {
return initParams(cmd, a.Params)
},
RunE: func(cmd *cobra.Command, args []string) error {
queue, err := cmd.Flags().GetString("queue")
verbosity, err := cmd.Flags().GetCount("verbose")
if err != nil {
return err
}
queue = strings.TrimSpace(queue)
return a.GetQueueSchedulingReport(queue)

queueName, err := cmd.Flags().GetString("queue")
if err != nil {
return err
}
queueName = strings.TrimSpace(queueName)

return a.GetQueueSchedulingReport(queueName, int32(verbosity))
},
}

cmd.Flags().CountP("verbose", "v", "report verbosity; repeat (e.g., -vvv) to increase verbosity")

cmd.Flags().String("queue", "", "Queue name to query reports for.")

return cmd
}

Expand Down
7 changes: 0 additions & 7 deletions cmd/testsuite/cmd/root.go
Original file line number Diff line number Diff line change
Expand Up @@ -18,13 +18,6 @@ func RootCmd() *cobra.Command {
Long: `testsuite is a suite of automated tests for Armada deployments.
Persistent config can be saved in a config file so it doesn't have to be specified every command.
Example structure:
armadaUrl: localhost:50051
basicAuth:
username: user1
password: password123
The location of this file can be passed in using the --config argument.
If not provided, $HOME/.armadactl.yaml is used.`,
}
Expand Down
28 changes: 16 additions & 12 deletions config/armada/config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -31,9 +31,17 @@ eventsApiRedis:
poolSize: 1000
scheduling:
enableAssertions: true
fairnessModel: "AssetFairness"
dominantResourceFairnessResourcesToConsider:
- "cpu"
- "memory"
- "nvidia.com/gpu"
resourceScarcity:
cpu: 1.0
preemption:
nodeEvictionProbability: 1.0
nodeOversubscriptionEvictionProbability: 1.0
protectedFractionOfFairShare: 1.0
setNodeIdSelector: true
nodeIdLabel: kubernetes.io/hostname
setNodeName: false
Expand All @@ -42,8 +50,8 @@ scheduling:
priority: 1000
preemptible: false
maximumResourceFractionPerQueue:
memory: 0.99
cpu: 0.99
memory: 1.0
cpu: 1.0
armada-preemptible:
priority: 1000
preemptible: true
Expand All @@ -53,7 +61,7 @@ scheduling:
maxExtraNodesToConsider: 1
maximumResourceFractionToSchedule:
memory: 1.0
cpu: 1.0
cpu: 1.0
maxJobSchedulingContextsPerExecutor: 10000
lease:
expireAfter: 15m
Expand All @@ -68,11 +76,6 @@ scheduling:
value: "true"
effect: "NoSchedule"
defaultJobTolerationsByPriorityClass:
"":
- key: "armadaproject.io/pc-armada-default"
operator: "Equal"
value: "true"
effect: "NoSchedule"
armada-default:
- key: "armadaproject.io/pc-armada-default"
operator: "Equal"
Expand All @@ -84,14 +87,14 @@ scheduling:
value: "true"
effect: "NoSchedule"
maxRetries: 5
resourceScarcity:
cpu: 1.0
maxPodSpecSizeBytes: 65535
minJobResources:
memory: 1Mi
indexedResources:
- cpu
- memory
- name: "cpu"
resolution: "100m"
- name: "memory"
resolution: "1Mi"
minTerminationGracePeriod: 1s
maxTerminationGracePeriod: 300s
queueManagement:
Expand All @@ -103,6 +106,7 @@ eventRetention:
retentionDuration: 336h
metrics:
refreshInterval: 5m
exposeSchedulingMetrics: true
pulsar:
URL: "pulsar://pulsar:6650"
jobsetEventsTopic: "events"
Expand Down
4 changes: 4 additions & 0 deletions config/executor/config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@ application:
deleteConcurrencyLimit: 2
useExecutorApi: false
useLegacyApi: true
jobLeaseRequestTimeout: "30s"
task:
utilisationReportingInterval: 1s
missingJobEventReconciliationInterval: 15s
Expand Down Expand Up @@ -58,6 +59,9 @@ kubernetes:
fatalPodSubmissionErrors:
- "admission webhook"
- "namespaces \".*\" not found"
stateChecks:
deadlineForSubmittedPodConsideredMissing: 15m
deadlineForActivePodConsideredMissing: 5m
pendingPodChecks:
deadlineForUpdates: 10m
deadlineForNodeAssignment: 5m
Expand Down
Loading

0 comments on commit fe68a24

Please sign in to comment.