Weave on Self-Managed infrastructure is currently in Private Preview. For production environments, W&B strongly recommends using W&B Dedicated Cloud, where Weave is Generally Available. To deploy a production-grade, Self-Managed instance, contact support@wandb.com.
Requirements
- W&B Platform installed. For more information, see the Self-Managed Deployment Guide.
- Bitnami’s ClickHouse Helm Chart.
- An S3 bucket pre-configured for ClickHouse storage. For configuration details, see Provide S3 Credentials; a bucket-creation sketch also follows this list.
- Kubernetes Cluster Nodes with the following specifications:
- CPU: 8 cores
- RAM: 64 GB
- Disk: 200GB+
 
- A Weave-enabled license from W&B. To request a license, please contact support@wandb.com.
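If the bucket does not exist yet, the following is a minimal sketch of creating one with the AWS CLI. The bucket name and region are placeholders, and the public-access settings are an assumption; adjust them to your environment and apply whatever IAM and encryption policies your organization requires.
aws s3api create-bucket \
  --bucket <bucket-name> \
  --region us-east-1
# For regions other than us-east-1, also pass:
#   --create-bucket-configuration LocationConstraint=<region>
# Block public access to the bucket (recommended):
aws s3api put-public-access-block \
  --bucket <bucket-name> \
  --public-access-block-configuration BlockPublicAcls=true,IgnorePublicAcls=true,BlockPublicPolicy=true,RestrictPublicBuckets=true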
1. Configure ClickHouse
The ClickHouse deployment in this document uses the Bitnami ClickHouse package.
The Bitnami Helm chart provides good support for basic ClickHouse functionalities, particularly the use of ClickHouse Keeper.
To configure ClickHouse, complete the following steps:
- Configure the Helm repository
- Create Helm Configuration
- Provide S3 credentials
Configure the Helm repository
- Add the Bitnami Helm repository:
helm repo add bitnami https://charts.bitnami.com/bitnami
- Update the repository:
helm repo update
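Optionally, confirm that the chart version pinned later in this guide is available from the repository. This is only a sanity check; the version matches the helm install command in the Install and deploy ClickHouse section.
helm search repo bitnami/clickhouse --versions | grep 8.0.10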
Create Helm Configuration
The most critical part of the Helm configuration is the ClickHouse configuration, which is provided in XML format. The following is an example values.yaml file with parameters you can customize to suit your environment.
To make the configuration process easier, we have added comments in the relevant sections using the format {/* COMMENT */}.
Modify the following parameters:
- clusterName
- auth.username
- auth.password
- S3 bucket-related configurations
W&B recommends keeping the clusterName value in values.yaml set to weave_cluster. This is the cluster name that W&B Weave expects when it runs the database migration. If you need to use a different name, see the Setting clusterName section for more information.
## @param clusterName ClickHouse cluster name
clusterName: weave_cluster
## @param shards Number of ClickHouse shards to deploy
shards: 1
## @param replicaCount Number of ClickHouse replicas per shard to deploy
## If Keeper is enabled, replicaCount also determines the number of Keeper nodes; Keeper is clustered by shard.
replicaCount: 3
persistence:
  enabled: true
  size: 30G # This size must be larger than the cache size.
## ClickHouse resource requests and limits
resources:
  requests:
    cpu: 0.5
    memory: 500Mi
  limits:
    cpu: 3.0
    memory: 6Gi
## Authentication
auth:
  username: weave_admin
  password: "weave_123"
  existingSecret: ""
  existingSecretKey: ""
## @param logLevel Logging level
logLevel: information
## @section ClickHouse keeper configuration parameters
keeper:
  enabled: true
## @param extraEnvVars Array with extra environment variables to add to ClickHouse nodes
##
extraEnvVars:
  - name: S3_ENDPOINT
    value: "https://s3.us-east-1.amazonaws.com/bucketname/$(CLICKHOUSE_REPLICA_ID)"
## @param defaultConfigurationOverrides [string] Default configuration overrides (evaluated as a template)
defaultConfigurationOverrides: |
  <clickhouse>
    {/* Macros */}
    <macros>
      <shard from_env="CLICKHOUSE_SHARD_ID"></shard>
      <replica from_env="CLICKHOUSE_REPLICA_ID"></replica>
    </macros>
    {/* Log Level */}
    <logger>
      <level>{{ .Values.logLevel }}</level>
    </logger>
    {{- if or (ne (int .Values.shards) 1) (ne (int .Values.replicaCount) 1)}}
    <remote_servers>
      <{{ .Values.clusterName }}>
        {{- $shards := $.Values.shards | int }}
        {{- range $shard, $e := until $shards }}
        <shard>
          <internal_replication>true</internal_replication>
          {{- $replicas := $.Values.replicaCount | int }}
          {{- range $i, $_e := until $replicas }}
          <replica>
            <host>{{ printf "%s-shard%d-%d.%s.%s.svc.%s" (include "common.names.fullname" $ ) $shard $i (include "clickhouse.headlessServiceName" $) (include "common.names.namespace" $) $.Values.clusterDomain }}</host>
            <port>{{ $.Values.service.ports.tcp }}</port>
          </replica>
          {{- end }}
        </shard>
        {{- end }}
      </{{ .Values.clusterName }}>
    </remote_servers>
    {{- end }}
    {{- if .Values.keeper.enabled }}
    <keeper_server>
      <tcp_port>{{ $.Values.containerPorts.keeper }}</tcp_port>
      {{- if .Values.tls.enabled }}
      <tcp_port_secure>{{ $.Values.containerPorts.keeperSecure }}</tcp_port_secure>
      {{- end }}
      <server_id from_env="KEEPER_SERVER_ID"></server_id>
      <log_storage_path>/bitnami/clickhouse/keeper/coordination/log</log_storage_path>
      <snapshot_storage_path>/bitnami/clickhouse/keeper/coordination/snapshots</snapshot_storage_path>
      <coordination_settings>
        <operation_timeout_ms>10000</operation_timeout_ms>
        <session_timeout_ms>30000</session_timeout_ms>
        <raft_logs_level>trace</raft_logs_level>
      </coordination_settings>
      <raft_configuration>
        {{- $nodes := .Values.replicaCount | int }}
        {{- range $node, $e := until $nodes }}
        <server>
          <id>{{ $node | int }}</id>
          <hostname from_env="{{ printf "KEEPER_NODE_%d" $node }}"></hostname>
          <port>{{ $.Values.service.ports.keeperInter }}</port>
        </server>
        {{- end }}
      </raft_configuration>
    </keeper_server>
    {{- end }}
    {{- if or .Values.keeper.enabled .Values.zookeeper.enabled .Values.externalZookeeper.servers }}
    <zookeeper>
      {{- if or .Values.keeper.enabled }}
      {{- $nodes := .Values.replicaCount | int }}
      {{- range $node, $e := until $nodes }}
      <node>
        <host from_env="{{ printf "KEEPER_NODE_%d" $node }}"></host>
        <port>{{ $.Values.service.ports.keeper }}</port>
      </node>
      {{- end }}
      {{- else if .Values.zookeeper.enabled }}
      {{- $nodes := .Values.zookeeper.replicaCount | int }}
      {{- range $node, $e := until $nodes }}
      <node>
        <host from_env="{{ printf "KEEPER_NODE_%d" $node }}"></host>
        <port>{{ $.Values.zookeeper.service.ports.client }}</port>
      </node>
      {{- end }}
      {{- else if .Values.externalZookeeper.servers }}
      {{- range $node := .Values.externalZookeeper.servers }}
      <node>
        <host>{{ $node }}</host>
        <port>{{ $.Values.externalZookeeper.port }}</port>
      </node>
      {{- end }}
      {{- end }}
    </zookeeper>
    {{- end }}
    {{- if .Values.metrics.enabled }}
    <prometheus>
      <endpoint>/metrics</endpoint>
      <port from_env="CLICKHOUSE_METRICS_PORT"></port>
      <metrics>true</metrics>
      <events>true</events>
      <asynchronous_metrics>true</asynchronous_metrics>
    </prometheus>
    {{- end }}
    <listen_host>0.0.0.0</listen_host>
    <listen_host>::</listen_host>
    <listen_try>1</listen_try>
    <storage_configuration>
      <disks>
        <s3_disk>
          <type>s3</type>
          <endpoint from_env="S3_ENDPOINT"></endpoint>
          {/* Avoid hardcoding credentials here; see the Provide S3 credentials section for the recommended approach */}
          <access_key_id>xxx</access_key_id>
          <secret_access_key>xxx</secret_access_key>
          <metadata_path>/var/lib/clickhouse/disks/s3_disk/</metadata_path>
        </s3_disk>
        <s3_disk_cache>
          <type>cache</type>
          <disk>s3_disk</disk>
          <path>/var/lib/clickhouse/s3_disk_cache/cache/</path>
          {/* The cache size must be smaller than the persistent volume size */}
          <max_size>20Gi</max_size>
        </s3_disk_cache>
      </disks>
      <policies>
        <s3_main>
          <volumes>
            <main>
              <disk>s3_disk_cache</disk>
            </main>
          </volumes>
        </s3_main>
      </policies>
    </storage_configuration>
    <merge_tree>
      <storage_policy>s3_main</storage_policy>
    </merge_tree>
  </clickhouse>
## @section Zookeeper subchart parameters
zookeeper:
  enabled: false
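Before installing, you can optionally render the chart locally to verify that the values.yaml above produces the ClickHouse XML you expect. This is only a sanity check; the release name clickhouse and the output file name are arbitrary choices for this example.
helm template clickhouse bitnami/clickhouse -f values.yaml --version 8.0.10 > rendered.yaml
# Inspect the generated ClickHouse configuration, for example the storage section:
grep -A 5 "<storage_configuration>" rendered.yaml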
S3 endpoint configuration
The bucket endpoint must be set as an environment variable to ensure that each ClickHouse replica reads and writes data in its own folder in the bucket.
extraEnvVars:
  - name: S3_ENDPOINT
    value: "https://s3.us-east-1.amazonaws.com/bucketname/$(CLICKHOUSE_REPLICA_ID)"
Do not remove $(CLICKHOUSE_REPLICA_ID) from the bucket endpoint configuration. It ensures that each ClickHouse replica reads and writes data in its own folder in the bucket.
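Once the pods are running (see Confirm ClickHouse deployment), you can check that each replica resolved its own endpoint. The pod name and namespace below are assumptions based on the example deployment in this guide.
kubectl exec clickhouse-shard0-0 -n <namespace> -- sh -c 'echo $S3_ENDPOINT'
# Each replica should print a URL ending in its own CLICKHOUSE_REPLICA_ID folder.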
Provide S3 credentials
You can provide credentials for accessing the S3 bucket either by hardcoding them in the configuration, or by having ClickHouse obtain them from environment variables or EC2 instance metadata.
Hardcode the configuration
Directly include the credentials in the storage configuration:
<type>s3</type>
<endpoint from_env="S3_ENDPOINT"></endpoint>
<access_key_id>xxx</access_key_id>
<secret_access_key>xxx</secret_access_key>
Use environment credentials
Alternatively, omit the keys and let ClickHouse read the credentials from environment variables or EC2 instance metadata:
<type>s3</type>
<endpoint from_env="S3_ENDPOINT"></endpoint>
<use_environment_credentials>true</use_environment_credentials>
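If you use environment credentials, one way to avoid putting keys in values.yaml is to store them in a Kubernetes Secret and expose them to the ClickHouse pods as the standard AWS environment variables. This is a minimal sketch; the secret name clickhouse-s3-credentials is an assumption, and you still need to reference it from the chart's extraEnvVars (for example via valueFrom.secretKeyRef) or use an IAM-based mechanism such as IRSA on EKS instead.
kubectl create secret generic clickhouse-s3-credentials -n <namespace> \
  --from-literal=AWS_ACCESS_KEY_ID=<access-key-id> \
  --from-literal=AWS_SECRET_ACCESS_KEY=<secret-access-key>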
2. Install and deploy ClickHouse
With the repositories set up and the values.yaml file prepared, the next step is to install ClickHouse.
helm install clickhouse bitnami/clickhouse -f values.yaml --version 8.0.10
Ensure that you use chart version 8.0.10. The latest chart version (9.0.0) does not work with the configuration proposed in this document.
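If you change values.yaml after the initial install, you can roll the changes out with helm upgrade while keeping the same chart version pinned. This sketch assumes the release name clickhouse used above.
helm upgrade clickhouse bitnami/clickhouse -f values.yaml --version 8.0.10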
3. Confirm ClickHouse deployment
Confirm that ClickHouse is deployed using the following command:
kubectl get pods -n <namespace>
You should see the following pods:
NAME                                 READY   STATUS    RESTARTS   AGE
clickhouse-shard0-0                  1/1     Running   0          9m59s
clickhouse-shard0-1                  1/1     Running   0          10m
clickhouse-shard0-2                  1/1     Running   0          10m
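Optionally, you can also verify from inside a pod that the replicated cluster and the S3-backed storage policy are configured as expected. The pod name, namespace, and credentials below are the example values from this guide; substitute your own.
kubectl exec -it clickhouse-shard0-0 -n <namespace> -- \
  clickhouse-client --user weave_admin --password weave_123 \
  --query "SELECT cluster, shard_num, replica_num, host_name FROM system.clusters WHERE cluster = 'weave_cluster'"
kubectl exec -it clickhouse-shard0-0 -n <namespace> -- \
  clickhouse-client --user weave_admin --password weave_123 \
  --query "SELECT policy_name, volume_name, disks FROM system.storage_policies"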
4. Deploy Weave
Weave is available for automatic deployment via the W&B Operator. With the W&B Platform installed, complete the following steps:
- Edit the CR instance used to deploy the platform.
- Add the Weave configuration.
- Use the Kubernetes service details to configure Weave tracing (an optional connectivity check is shown after these details):
- Endpoint: <release-name>-headless.<namespace>.svc.cluster.local
- Replace <release-name> with your Helm release name
- Replace <namespace> with your namespace
- Get the service details: kubectl get svc -n <namespace>
- Username: Set in values.yaml
- Password: Set in values.yaml
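Optionally, before updating the CR, verify that the ClickHouse HTTP interface is reachable at that endpoint from inside the cluster. The temporary curl pod and the example credentials below are assumptions; substitute your own release name, namespace, and auth values.
kubectl run clickhouse-ping --rm -it --restart=Never -n <namespace> \
  --image=curlimages/curl -- \
  curl -s -u weave_admin:weave_123 \
  http://<release-name>-headless.<namespace>.svc.cluster.local:8123/ping
# A healthy ClickHouse server responds with "Ok."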
- With this information, update the W&B Platform Custom Resource (CR) by adding the following configuration:
apiVersion: apps.wandb.com/v1
kind: WeightsAndBiases
metadata:
  labels:
    app.kubernetes.io/name: weightsandbiases
    app.kubernetes.io/instance: wandb
  name: wandb
  namespace: default
spec:
  values:
    global:
    [...]
      clickhouse:
        host: <release-name>-headless.<namespace>.svc.cluster.local
        port: 8123
        password: <password>
        user: <username>
        database: wandb_weave
        # `replicated` must be set to `true` if replicating data across multiple nodes
        # This option is in preview; also set the `WF_CLICKHOUSE_REPLICATED` env var under weave-trace
        replicated: true
      weave-trace:
        enabled: true
    [...]
    weave-trace:
      install: true
      extraEnv:
        WF_CLICKHOUSE_REPLICATED: "true"
    [...]
 
When using more than one replica (W&B recommends at least 3 replicas), make sure the following environment variable is set for Weave Traces:
extraEnv:
  WF_CLICKHOUSE_REPLICATED: "true"
Also set replicated: true in the clickhouse section of the CR; this option is in preview.
- Set the clusterName in values.yaml to weave_cluster; otherwise, the database migration will fail. If you use a different cluster name, set the WF_CLICKHOUSE_REPLICATED_CLUSTER environment variable in weave-trace.extraEnv to match the chosen name, as shown in the example below.
[...]
  clickhouse:
    host: <release-name>-headless.<namespace>.svc.cluster.local
    port: 8123
    password: <password>
    user: <username>
    database: wandb_weave
    # `replicated` must be set to `true` if replicating data across multiple nodes
    # This option is in preview; also set the `WF_CLICKHOUSE_REPLICATED` env var under weave-trace
    replicated: true
  weave-trace:
    enabled: true
[...]
weave-trace:
  install: true
  extraEnv:
    WF_CLICKHOUSE_REPLICATED: "true"
    WF_CLICKHOUSE_REPLICATED_CLUSTER: "different_cluster_name"
[...]
A complete example CR, including the ClickHouse and Weave tracing configuration, looks like the following:
apiVersion: apps.wandb.com/v1
kind: WeightsAndBiases
metadata:
  labels:
    app.kubernetes.io/name: weightsandbiases
    app.kubernetes.io/instance: wandb
  name: wandb
  namespace: default
spec:
  values:
    global:
      license: eyJhbGnUzaHgyQjQyQWhEU3...ZieKQ2x5GGfw
      host: https://wandb.example.com
      bucket:
        name: abc-wandb-moving-pipefish
        provider: gcs
      mysql:
        database: wandb_local
        host: 10.218.0.2
        name: wandb_local
        password: 8wtX6cJHizAZvYScjDzZcUarK4zZGjpV
        port: 3306
        user: wandb
      clickhouse:
        host: <release-name>-headless.<namespace>.svc.cluster.local
        port: 8123
        password: <password>
        user: <username>
        database: wandb_weave
        # This option must be true if replicating data across multiple nodes
        replicated: true
      weave-trace:
        enabled: true
    ingress:
      annotations:
        ingress.gcp.kubernetes.io/pre-shared-cert: abc-wandb-cert-creative-puma
        kubernetes.io/ingress.class: gce
        kubernetes.io/ingress.global-static-ip-name: abc-wandb-operator-address
    weave-trace:
      install: true
      extraEnv:
        WF_CLICKHOUSE_REPLICATED: "true"
 
- With the Custom Resource (CR) prepared, apply the new configuration:
kubectl apply -f wandb.yaml
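After applying the CR, you can optionally check that the operator rolled out the Weave tracing components and that they can reach ClickHouse. The grep filter and the deployment name below are assumptions that may differ in your installation; a plain kubectl get pods also works.
kubectl get pods -n <namespace> | grep -i weave
# Inspect the Weave traces service logs for ClickHouse connection or migration errors
# (the deployment name weave-trace is an assumption; adjust to your installation):
kubectl logs -n <namespace> deployment/weave-trace --tail=100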
 
6. Access Weave
Once the deployment is running, open the W&B endpoint configured in the host option. The Weave license status should show as enabled.