diff --git a/.github/workflows/canary.yml b/.github/workflows/canary.yml index 72243d5a6ca..8b32d9b33fa 100644 --- a/.github/workflows/canary.yml +++ b/.github/workflows/canary.yml @@ -54,7 +54,7 @@ jobs: - name: Checkout Oneflow-Inc/oneflow if: ${{ github.event.inputs.oneflow-ref == '' }} uses: actions/checkout@v2 - - uses: Oneflow-Inc/get-oneflow@support-cu12 + - uses: Oneflow-Inc/get-oneflow@add-nightly-date name: Build manylinux id: build-cuda with: @@ -72,8 +72,8 @@ jobs: clean-ccache: true compute-platform: ${{ env.COMPUTE_PLATFORM }} python-versions: | + 3.7 3.8 - 3.10 - name: Upload wheelhouse uses: ./.github/actions/upload_oss with: diff --git a/.github/workflows/on_merge.yml b/.github/workflows/on_merge.yml index 6cf96474110..f92f8e42a44 100644 --- a/.github/workflows/on_merge.yml +++ b/.github/workflows/on_merge.yml @@ -15,6 +15,6 @@ jobs: if: github.event.pull_request.merged == true runs-on: ubuntu-latest steps: - - uses: Oneflow-Inc/get-oneflow/update-benchmark-history@support-cu12 + - uses: Oneflow-Inc/get-oneflow/update-benchmark-history@add-nightly-date name: Update benchmark history timeout-minutes: 10 diff --git a/.github/workflows/priv_release.yml b/.github/workflows/priv_release.yml deleted file mode 100644 index 3d59b751f53..00000000000 --- a/.github/workflows/priv_release.yml +++ /dev/null @@ -1,37 +0,0 @@ -name: Priv Release - -on: - push: - branches: - - priv-release - - add-cu12-release - schedule: - # beijing: 12 pm. - # utc: 4 am. - - cron: "0 4 * * *" - workflow_dispatch: - inputs: - priv_branch: - required: false - default: "main" - -concurrency: - group: priv-release-${{ github.ref }} - cancel-in-progress: true - -jobs: - release: - name: Release pip - permissions: - contents: read - pull-requests: write - uses: ./.github/workflows/release.yml - with: - is_priv: true - branch: ${{ inputs.priv_branch || 'main' }} - secrets: - ONEFLOW_PRIV_ORG: ${{ secrets.ONEFLOW_PRIV_ORG }} - ONEFLOW_PRIV_GH_TOKEN: ${{ secrets.ONEFLOW_PRIV_GH_TOKEN }} - ONEFLOW_PRIV_OSS_BUCKET: ${{ secrets.ONEFLOW_PRIV_OSS_BUCKET }} - OSS_ACCESS_KEY_ID: ${{ secrets.OSS_ACCESS_KEY_ID }} - OSS_ACCESS_KEY_SECRET: ${{ secrets.OSS_ACCESS_KEY_SECRET }} diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml index 0bb05ee0eb8..d90504d8d11 100644 --- a/.github/workflows/release.yml +++ b/.github/workflows/release.yml @@ -14,26 +14,6 @@ on: placeholder: description: "update .github/workflows/release.yml to config your build" required: false - workflow_call: - inputs: - is_priv: - required: true - type: boolean - branch: - required: false - type: string - default: "main" - secrets: - ONEFLOW_PRIV_ORG: - required: true - ONEFLOW_PRIV_GH_TOKEN: - required: true - ONEFLOW_PRIV_OSS_BUCKET: - required: true - OSS_ACCESS_KEY_ID: - required: true - OSS_ACCESS_KEY_SECRET: - required: true concurrency: group: release-${{ github.ref }} cancel-in-progress: ${{ github.ref != 'refs/heads/master' }} @@ -51,19 +31,11 @@ jobs: steps: - name: Checkout Oneflow-Inc/oneflow uses: actions/checkout@v2 - if: ${{ !inputs.is_priv }} with: ref: ${{ github.event.pull_request.head.sha }} repository: ${{github.event.pull_request.head.repo.full_name}} - - name: Checkout oneflow - uses: actions/checkout@v2 - if: ${{ inputs.is_priv }} - with: - ref: ${{ inputs.branch }} - repository: ${{ secrets.ONEFLOW_PRIV_ORG }}/oneflow - token: ${{ secrets.ONEFLOW_PRIV_GH_TOKEN }} - - uses: Oneflow-Inc/get-oneflow/cache-complete/matrix/build@support-cu12 - name: Find build cache + - uses: 
Oneflow-Inc/get-oneflow/cache-complete/matrix/build@add-nightly-date + name: find cache id: find-cache timeout-minutes: 5 with: @@ -74,9 +46,9 @@ jobs: release oneflow-src: ${{ env.ONEFLOW_SRC }} entries: | - cu122 - cu121 cu118 + cu117 + cu116 cpu - name: Get current date id: date @@ -87,17 +59,14 @@ jobs: MANYLINUX_CACHE_DIR: ~/manylinux-cache-dir/release/${{ matrix.entry }} WHEELHOUSE_DIR: manylinux_wheelhouse OSS_DIR: branch/${{ github.ref_name }}/${{ matrix.entry }}/${{ github.sha }} - GITHUB_REF_NAME: ${{ github.ref_name }} - GITHUB_SHA: ${{ github.sha }} - ONEFLOW_OSS_BUCKET: oneflow-staging needs: [generate-build-matrix] name: Staging Release timeout-minutes: 180 runs-on: [self-hosted, linux, release] - if: github.repository == 'Oneflow-Inc/oneflow' || inputs.is_priv + if: github.repository == 'Oneflow-Inc/oneflow' strategy: fail-fast: false - max-parallel: 6 + max-parallel: 5 matrix: ${{ fromJson(needs.generate-build-matrix.outputs.matrix) }} steps: - name: Fix permissions @@ -108,66 +77,10 @@ jobs: python3 -m pip config set global.index-url https://pypi.tuna.tsinghua.edu.cn/simple python3 -m pip install -U setuptools wheel --user python3 -m pip install oss2 --user - - name: Checkout Oneflow-Inc/oneflow - uses: actions/checkout@v2 - if: ${{ !inputs.is_priv }} - with: - ref: ${{ github.event.pull_request.head.sha }} - repository: ${{github.event.pull_request.head.repo.full_name}} - - name: Checkout private oneflow - uses: actions/checkout@v2 - if: ${{ inputs.is_priv }} - with: - ref: ${{ inputs.branch }} - repository: ${{ secrets.ONEFLOW_PRIV_ORG }}/oneflow - token: ${{ secrets.ONEFLOW_PRIV_GH_TOKEN }} - - name: Checkout cutlass_extension - uses: actions/checkout@v2 - if: ${{ inputs.is_priv }} - with: - repository: ${{ secrets.ONEFLOW_PRIV_ORG }}/cutlass-extension - token: ${{ secrets.ONEFLOW_PRIV_GH_TOKEN }} - path: cutlass-extension - - name: Set Private env - if: ${{ inputs.is_priv }} - run: | - GITHUB_SHA=$(git rev-parse HEAD) - echo "OSS_DIR=branch/${{ inputs.branch }}/${{ matrix.entry }}/${GITHUB_SHA}" >> $GITHUB_ENV - echo "GITHUB_REF_NAME=${{ inputs.branch }}" >> $GITHUB_ENV - echo "GITHUB_SHA=${GITHUB_SHA}" >> $GITHUB_ENV - echo "ONEFLOW_OSS_BUCKET=${{ secrets.ONEFLOW_PRIV_OSS_BUCKET }}" >> $GITHUB_ENV - - name: Print env - if: ${{ inputs.is_priv }} - run: | - env - - uses: Oneflow-Inc/get-oneflow@support-cu12 - name: Build ${{ matrix.entry }} - if: ${{ matrix.entry =='cu118' || startsWith(matrix.entry, 'cu12') }} - with: - cmake-init-cache: ${{ env.ONEFLOW_SRC }}/cmake/caches/ci/release/cu118.cmake - build-script: ${{ env.ONEFLOW_SRC }}/ci/manylinux/build-gcc9.sh - oneflow-src: ${{ env.ONEFLOW_SRC }} - oneflow-build-env: manylinux - wheelhouse-dir: ${{ env.WHEELHOUSE_DIR }} - clear-wheelhouse-dir: true - self-hosted: true - compute-platform: ${{ matrix.entry }} - manylinux-cache-dir: ${{ env.MANYLINUX_CACHE_DIR }} - docker-run-use-system-http-proxy: false - docker-run-use-lld: false - retry-failed-build: true - clean-ccache: true - nightly: ${{ github.event_name == 'schedule' || github.ref == 'refs/heads/release/add_nightly_date_index'}} - nightly-date: ${{ needs.generate-build-matrix.outputs.formatted_date }} - use-nvidia-wheels: ${{ matrix.entry !='cu112' }} - python-versions: | - 3.7 - 3.8 - 3.9 - 3.10 - - uses: Oneflow-Inc/get-oneflow@support-cu12 + - uses: actions/checkout@v2 + - uses: Oneflow-Inc/get-oneflow@add-nightly-date name: Build ${{ matrix.entry }} - if: ${{ startsWith(matrix.entry, 'cu') && matrix.entry !='cu118' && !startsWith(matrix.entry, 'cu12') }} + 
if: ${{ matrix.entry !='cpu' }} with: cmake-init-cache: ${{ env.ONEFLOW_SRC }}/cmake/caches/ci/release/cuda.cmake build-script: ${{ env.ONEFLOW_SRC }}/ci/manylinux/build-gcc9.sh @@ -190,7 +103,7 @@ jobs: 3.8 3.9 3.10 - - uses: Oneflow-Inc/get-oneflow@support-cu12 + - uses: Oneflow-Inc/get-oneflow@add-nightly-date name: Build ${{ matrix.entry }} if: ${{ matrix.entry =='cpu' }} with: @@ -218,7 +131,7 @@ jobs: uses: ./.github/actions/upload_oss with: src_path: ${{ env.WHEELHOUSE_DIR }} - oss_dst_path: oss://${{ env.ONEFLOW_OSS_BUCKET }}/${{ env.OSS_DIR }} + oss_dst_path: oss://oneflow-staging/${{ env.OSS_DIR }} oss_access_key_id: ${{ secrets.OSS_ACCESS_KEY_ID }} oss_access_key_secret: ${{ secrets.OSS_ACCESS_KEY_SECRET }} - name: Update pip index @@ -228,13 +141,13 @@ jobs: run: | python3 -m pip config set global.index-url https://pypi.tuna.tsinghua.edu.cn/simple python3 -m pip install oss2 beautifulsoup4 --user - python3 tools/create_pip_index.py --dir_key ${{ env.OSS_DIR }} -b ${{ env.ONEFLOW_OSS_BUCKET }} \ - --index_key=branch/${{ env.GITHUB_REF_NAME }}/${{ matrix.entry }}/index.html \ - --index_key=branch/${{ env.GITHUB_REF_NAME }}/date/${{ needs.generate-build-matrix.outputs.formatted_date }}/${{ matrix.entry }}/index.html \ + python3 tools/create_pip_index.py --dir_key ${{ env.OSS_DIR }} -b oneflow-staging \ + --index_key=branch/${{ github.ref_name }}/${{ matrix.entry }}/index.html \ + --index_key=branch/${{ github.ref_name }}/date/${{ needs.generate-build-matrix.outputs.formatted_date }}/${{ matrix.entry }}/index.html \ --index_key=${{ env.OSS_DIR }}/index.html \ - --index_key=commit/${{ env.GITHUB_SHA }}/${{ matrix.entry }}/index.html + --index_key=commit/${{ github.sha }}/${{ matrix.entry }}/index.html - name: Update API docs - if: github.ref == 'refs/heads/master' && matrix.entry == 'cpu' && !inputs.is_priv + if: github.ref == 'refs/heads/master' && matrix.entry == 'cpu' env: READTHEDOCS_TOKEN: ${{ secrets.READTHEDOCS_TOKEN }} run: | diff --git a/.github/workflows/simple.yml b/.github/workflows/simple.yml index 3668e422154..036b0330557 100644 --- a/.github/workflows/simple.yml +++ b/.github/workflows/simple.yml @@ -244,7 +244,7 @@ jobs: repository: Oneflow-Inc/conda-env ref: 30a7f00eb48ee9009d85a848e720823e5054c66b path: conda-env - - uses: Oneflow-Inc/get-oneflow@support-cu12 + - uses: Oneflow-Inc/get-oneflow@add-nightly-date name: Build with gcc7 if: ${{ matrix.build-type == 'gcc7'}} with: @@ -253,7 +253,7 @@ jobs: oneflow-build-env: conda conda-env-file: conda-env/dev/gcc7/environment-v2.yml conda-env-name: oneflow-dev-gcc7-v2 - - uses: Oneflow-Inc/get-oneflow@support-cu12 + - uses: Oneflow-Inc/get-oneflow@add-nightly-date name: Build with clang10 if: ${{ matrix.build-type == 'clang10'}} with: diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index 8ee44d87762..8463bc730bb 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -176,7 +176,7 @@ jobs: with: ref: ${{ github.event.pull_request.head.sha }} repository: ${{github.event.pull_request.head.repo.full_name}} - - uses: Oneflow-Inc/get-oneflow/cache-complete/matrix/build@support-cu12 + - uses: Oneflow-Inc/get-oneflow/cache-complete/matrix/build@add-nightly-date name: find cache id: find-cache timeout-minutes: 5 @@ -219,7 +219,7 @@ jobs: with: ref: ${{ github.event.pull_request.head.sha }} repository: ${{github.event.pull_request.head.repo.full_name}} - - uses: Oneflow-Inc/get-oneflow/cache-complete@support-cu12 + - uses: Oneflow-Inc/get-oneflow/cache-complete@add-nightly-date name: Save 
cache if successful id: save-cache timeout-minutes: 5 @@ -233,7 +233,7 @@ jobs: run: | echo "::error file=test.yml,line=204,col=10::steps.save-cache.outputs.cache-hit != matrix.cache-hit" exit 1 - - uses: Oneflow-Inc/get-oneflow@support-cu12 + - uses: Oneflow-Inc/get-oneflow@add-nightly-date name: Build manylinux ${{ matrix.entry }} id: build-cpu if: ${{ matrix.entry =='cpu' && !matrix.cache-hit }} @@ -255,7 +255,7 @@ jobs: python-versions: | 3.7 3.8 - - uses: Oneflow-Inc/get-oneflow@support-cu12 + - uses: Oneflow-Inc/get-oneflow@add-nightly-date name: Build manylinux ${{ matrix.entry }} id: build-cpu-sanitizers if: ${{ (matrix.entry == 'cpu-asan-ubsan' || matrix.entry == 'cpu-tsan') && !matrix.cache-hit && false }} @@ -276,7 +276,7 @@ jobs: clean-ccache: ${{ contains(github.event.pull_request.labels.*.name, 'need-clean-ccache') }} python-versions: | 3.8 - - uses: Oneflow-Inc/get-oneflow@support-cu12 + - uses: Oneflow-Inc/get-oneflow@add-nightly-date name: Build manylinux ${{ matrix.entry }} id: build-cuda if: ${{ matrix.entry =='cu116' && !matrix.cache-hit }} @@ -296,7 +296,7 @@ jobs: clean-ccache: ${{ contains(github.event.pull_request.labels.*.name, 'need-clean-ccache') }} python-versions: | 3.7 - - uses: Oneflow-Inc/get-oneflow@support-cu12 + - uses: Oneflow-Inc/get-oneflow@add-nightly-date name: Build ${{ matrix.entry }} if: ${{ matrix.entry == 'llvm15' && !matrix.cache-hit }} with: @@ -335,7 +335,7 @@ jobs: }) - name: Upload packed liboneflow if: ${{ !fromJson(matrix.cache-hit) && matrix.entry != 'llvm15' && matrix.entry != 'cpu-asan-ubsan' && matrix.entry != 'cpu-tsan' }} - uses: Oneflow-Inc/get-oneflow/digest/upload@support-cu12 + uses: Oneflow-Inc/get-oneflow/digest/upload@add-nightly-date timeout-minutes: 10 with: digest: ${{ steps.save-cache.outputs.build-digest }} @@ -346,7 +346,7 @@ jobs: dst-dir: cpack - name: Upload whl if: ${{ !fromJson(matrix.cache-hit) && matrix.entry != 'llvm15' && matrix.entry != 'cpu-asan-ubsan' && matrix.entry != 'cpu-tsan' }} - uses: Oneflow-Inc/get-oneflow/digest/upload@support-cu12 + uses: Oneflow-Inc/get-oneflow/digest/upload@add-nightly-date timeout-minutes: 10 with: digest: ${{ steps.save-cache.outputs.build-digest }} @@ -371,7 +371,7 @@ jobs: with: ref: ${{ github.event.pull_request.head.sha }} repository: ${{github.event.pull_request.head.repo.full_name}} - - uses: Oneflow-Inc/get-oneflow/cache-complete/matrix/test@support-cu12 + - uses: Oneflow-Inc/get-oneflow/cache-complete/matrix/test@add-nightly-date name: find cache id: find-cache timeout-minutes: 5 @@ -402,7 +402,7 @@ jobs: with: ref: ${{ github.event.pull_request.head.sha }} repository: ${{github.event.pull_request.head.repo.full_name}} - - uses: Oneflow-Inc/get-oneflow/cache-complete/matrix/test@support-cu12 + - uses: Oneflow-Inc/get-oneflow/cache-complete/matrix/test@add-nightly-date name: find cache id: find-cache timeout-minutes: 5 @@ -484,7 +484,7 @@ jobs: if: ${{ contains(matrix.runs-on, 'self-hosted') }} run: | docker rm -f ${{ env.TEST_CONTAINER_NAME }} || true - - uses: Oneflow-Inc/get-oneflow/cache-complete@support-cu12 + - uses: Oneflow-Inc/get-oneflow/cache-complete@add-nightly-date name: Save cache if successful id: save-cache timeout-minutes: 5 @@ -500,7 +500,7 @@ jobs: exit 1 - name: Download wheel and packed liboneflow if: ${{ !fromJson(matrix.cache-hit) && contains(matrix.runs-on, 'self-hosted') }} - uses: Oneflow-Inc/get-oneflow/digest/download@support-cu12 + uses: Oneflow-Inc/get-oneflow/digest/download@add-nightly-date id: download-digest timeout-minutes: 10 with: @@ 
-510,7 +510,7 @@ jobs: ssh-tank-path: ${{ env.SSH_TANK_PATH }} - name: Get primary node if: ${{ !fromJson(matrix.cache-hit) && contains(matrix.runs-on, 'self-hosted') }} - uses: Oneflow-Inc/get-oneflow/master-address@support-cu12 + uses: Oneflow-Inc/get-oneflow/master-address@add-nightly-date id: get-primary-node with: rank: ${{ matrix.rank }} @@ -710,7 +710,7 @@ jobs: if: ${{ contains(matrix.runs-on, 'self-hosted') }} run: | docker rm -f ${{ env.TEST_MANYLINUX_CONTAINER_NAME }} || true - - uses: Oneflow-Inc/get-oneflow/cache-complete@support-cu12 + - uses: Oneflow-Inc/get-oneflow/cache-complete@add-nightly-date name: Save cache if successful id: save-cache timeout-minutes: 5 @@ -726,7 +726,7 @@ jobs: exit 1 - name: Download wheel and packed liboneflow if: ${{ !fromJson(matrix.cache-hit) && contains(matrix.runs-on, 'self-hosted') }} - uses: Oneflow-Inc/get-oneflow/digest/download@support-cu12 + uses: Oneflow-Inc/get-oneflow/digest/download@add-nightly-date id: download-digest timeout-minutes: 10 with: @@ -736,7 +736,7 @@ jobs: ssh-tank-path: ${{ env.SSH_TANK_PATH }} - name: Download ASAN and UBSAN wheel and packed liboneflow if: ${{ !fromJson(matrix.cache-hit) && contains(matrix.runs-on, 'self-hosted') && matrix.device == 'cpu' && false }} - uses: Oneflow-Inc/get-oneflow/digest/download@support-cu12 + uses: Oneflow-Inc/get-oneflow/digest/download@add-nightly-date id: asan-ubsan-download-digest timeout-minutes: 10 with: @@ -746,7 +746,7 @@ jobs: ssh-tank-path: ${{ env.SSH_TANK_PATH }} - name: Download TSAN wheel and packed liboneflow if: ${{ !fromJson(matrix.cache-hit) && contains(matrix.runs-on, 'self-hosted') && matrix.device == 'cpu' && false }} - uses: Oneflow-Inc/get-oneflow/digest/download@support-cu12 + uses: Oneflow-Inc/get-oneflow/digest/download@add-nightly-date id: tsan-download-digest timeout-minutes: 10 with: @@ -934,7 +934,7 @@ jobs: issue_number: context.issue.number, owner: context.repo.owner, repo: context.repo.repo, - body: "View latest API docs preview at: https://oneflow-staging.oss-cn-beijing.aliyuncs.com/${{ env.DOCS_PATH }}/" + body: "View latest API docs preview at: https://staging.oneflow.info/${{ env.DOCS_PATH }}/" }) - name: Doctest timeout-minutes: 45 @@ -1072,7 +1072,7 @@ jobs: - name: Benchmark Test timeout-minutes: 100 if: ${{ !fromJson(matrix.cache-hit) && matrix.test-type == 'benchmark' && matrix.device == 'cuda' }} - uses: Oneflow-Inc/get-oneflow/pytest-benchmark@support-cu12 + uses: Oneflow-Inc/get-oneflow/pytest-benchmark@add-nightly-date with: collect-path: ${{ env.FLOW_VISION_SRC }}/benchmark container-name: ${{ env.TEST_CONTAINER_NAME }} @@ -1133,7 +1133,7 @@ jobs: ref: ${{ github.event.pull_request.head.sha }} repository: ${{github.event.pull_request.head.repo.full_name}} fetch-depth: 0 - - uses: Oneflow-Inc/get-oneflow/cache-complete@support-cu12 + - uses: Oneflow-Inc/get-oneflow/cache-complete@add-nightly-date name: Save cache if successful id: save-cache timeout-minutes: 5 diff --git a/README.md b/README.md index 4d4b127ed3e..e01af3989f1 100644 --- a/README.md +++ b/README.md @@ -1,7 +1,6 @@ # OneFlow OneFlow is a deep learning framework designed to be **user-friendly, scalable and efficient**. 
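
> Reviewer note (hedged; `ref_name`, `entry`, `sha`, and `date` below are illustrative placeholders, not values from this diff): the `--index_key` arguments passed to `tools/create_pip_index.py` in the release workflow above make each build reachable by branch, by date, and by commit, which is the same layout the README install URLs that follow rely on. A minimal sketch of that key scheme:

```python
# Sketch of the pip index layout implied by the --index_key arguments
# in release.yml: branch/<ref>/<entry>, branch/<ref>/date/<date>/<entry>,
# <oss_dir>, and commit/<sha>/<entry>. All argument values are placeholders.
def index_keys(ref_name: str, entry: str, sha: str, date: str, oss_dir: str):
    return [
        f"branch/{ref_name}/{entry}/index.html",
        f"branch/{ref_name}/date/{date}/{entry}/index.html",
        f"{oss_dir}/index.html",
        f"commit/{sha}/{entry}/index.html",
    ]

print(index_keys("master", "cu117", "abc123", "20230101",
                 "branch/master/cu117/abc123"))
```
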
With OneFlow, it is easy to: - - program a model with [**PyTorch-like API**](https://oneflow.readthedocs.io/en/master/) - scale a model to n-dimensional-parallel execution with the [**Global Tensor**](https://docs.oneflow.org/en/master/cookies/global_tensor.html) - accelerate/deploy a model with the [**Graph Compiler**](https://oneflow.readthedocs.io/en/master/graph.html). @@ -61,23 +60,24 @@ OneFlow is a deep learning framework designed to be **user-friendly, scalable an - To install nightly release of OneFlow with CUDA support: ```bash - python3 -m pip install --pre oneflow -f https://oneflow-staging.oss-cn-beijing.aliyuncs.com/branch/master/cu118 + python3 -m pip install --pre oneflow -f https://staging.oneflow.info/branch/master/cu117 ``` - To install other available builds for different variants: - Stable ```bash - python3 -m pip install --find-links https://release.oneflow.info oneflow==0.9.0+cu118 + python3 -m pip install --find-links https://release.oneflow.info oneflow==0.9.0+cu117 ``` - Nightly ``` - python3 -m pip install --pre oneflow -f https://oneflow-staging.oss-cn-beijing.aliyuncs.com/branch/master/[PLATFORM] + python3 -m pip install --pre oneflow -f https://staging.oneflow.info/branch/master/[PLATFORM] ``` - All available `[PLATFORM]`: | Platform |CUDA Driver Version| Supported GPUs | |---|---|---| - | cu118 | >= 450.80.02 | GTX 10xx, RTX 20xx, A100, RTX 30xx | + | cu117 | >= 450.80.02 | GTX 10xx, RTX 20xx, A100, RTX 30xx | + | cu102 | >= 440.33 | GTX 10xx, RTX 20xx | | cpu | N/A | N/A | - If you are in China, you could run this to have pip download packages from domestic mirror of pypi: diff --git a/cmake/caches/ci/release/cu118.cmake b/cmake/caches/ci/release/cu118.cmake deleted file mode 100644 index 270afb4409e..00000000000 --- a/cmake/caches/ci/release/cu118.cmake +++ /dev/null @@ -1,17 +0,0 @@ -set(BUILD_CUDA YES CACHE BOOL "") -set(BUILD_GIT_VERSION YES CACHE BOOL "") -set(BUILD_TESTING OFF CACHE BOOL "") -set(BUILD_RDMA YES CACHE BOOL "") -set(TREAT_WARNINGS_AS_ERRORS YES CACHE BOOL "") -set(THIRD_PARTY_MIRROR aliyun CACHE STRING "") -set(PIP_INDEX_MIRROR "https://pypi.tuna.tsinghua.edu.cn/simple" CACHE STRING "") -set(CMAKE_BUILD_TYPE Release CACHE STRING "") -set(CMAKE_GENERATOR Ninja CACHE STRING "") -set(CMAKE_CUDA_ARCHITECTURES "70-real;80-real;86-real;89-real;90-real" CACHE STRING "") -set(CUDNN_STATIC OFF CACHE BOOL "") -set(WITH_MLIR ON CACHE BOOL "") -set(BUILD_CPP_API OFF CACHE BOOL "") -set(CUDA_NVCC_THREADS_NUMBER 2 CACHE STRING "") -set(CMAKE_C_COMPILER_LAUNCHER ccache CACHE STRING "") -set(CMAKE_CXX_COMPILER_LAUNCHER ccache CACHE STRING "") -set(CMAKE_CUDA_COMPILER_LAUNCHER ccache CACHE STRING "") diff --git a/docs/requirements.txt b/docs/requirements.txt index 7471b37f813..a72b2dda50c 100644 --- a/docs/requirements.txt +++ b/docs/requirements.txt @@ -5,5 +5,5 @@ furo==2021.4.11b34 sphinx-copybutton==0.5.0 # above are dev dependencies --pre ---find-links https://oneflow-staging.oss-cn-beijing.aliyuncs.com/branch/master/cpu +--find-links https://staging.oneflow.info/branch/master/cpu oneflow diff --git a/docs/source/nn.functional.rst b/docs/source/nn.functional.rst index 6ba341196cb..971dc768749 100644 --- a/docs/source/nn.functional.rst +++ b/docs/source/nn.functional.rst @@ -75,7 +75,6 @@ Non-linear activation functions selu celu leaky_relu - square_relu prelu glu gelu diff --git a/docs/source/nn.rst b/docs/source/nn.rst index 62e52657e86..8cd49bc1d49 100644 --- a/docs/source/nn.rst +++ b/docs/source/nn.rst @@ -160,7 +160,6 @@ Non-linear Activations 
(weighted sum, nonlinearity) nn.CELU nn.GELU nn.QuickGELU - nn.SquareReLU nn.SiLU nn.Sigmoid nn.Mish diff --git a/docs/source/oneflow.rst b/docs/source/oneflow.rst index 592d3bd6081..972963909bf 100644 --- a/docs/source/oneflow.rst +++ b/docs/source/oneflow.rst @@ -270,7 +270,6 @@ Pointwise Ops fmod gelu quick_gelu - square_relu log log1p log2 diff --git a/oneflow/api/python/framework/device.cpp b/oneflow/api/python/framework/device.cpp index df7278a2dd1..445b953aac4 100644 --- a/oneflow/api/python/framework/device.cpp +++ b/oneflow/api/python/framework/device.cpp @@ -31,10 +31,9 @@ ONEFLOW_API_PYBIND11_MODULE("", m) { .def(py::init([](const std::string& type_or_type_with_device_id) { return Device::ParseAndNew(type_or_type_with_device_id).GetOrThrow(); })) - .def(py::init([](const std::string& type, int64_t index) { - return Device::New(type, index).GetOrThrow(); - }), - py::arg("type"), py::arg("index")) + .def(py::init([](const std::string& type, int64_t device_id) { + return Device::New(type, device_id).GetOrThrow(); + })) .def(py::init([](const Symbol& other_device) { return other_device; })) .def_property_readonly("type", [](const Symbol& d) { return d->type(); }) .def_property_readonly("index", [](const Symbol& d) { return d->device_id(); }) diff --git a/oneflow/core/autograd/gradient_funcs/activation.cpp b/oneflow/core/autograd/gradient_funcs/activation.cpp index 2f388e94cee..03db0d7f49c 100644 --- a/oneflow/core/autograd/gradient_funcs/activation.cpp +++ b/oneflow/core/autograd/gradient_funcs/activation.cpp @@ -152,36 +152,6 @@ class QuickGeLU : public OpExprGradFunction { } }; -struct SquareReLUCaptureState : public AutoGradCaptureState { - bool requires_grad = false; -}; - -class SquareReLU : public OpExprGradFunction { - public: - Maybe Init(const OpExpr& op) override { return Maybe::Ok(); } - - Maybe Capture(SquareReLUCaptureState* ctx, const TensorTuple& inputs, - const TensorTuple& outputs, const AttrMap& attrs) const override { - CHECK_EQ_OR_RETURN(inputs.size(), 1); // NOLINT(maybe-need-error-msg) - CHECK_EQ_OR_RETURN(outputs.size(), 1); // NOLINT(maybe-need-error-msg) - ctx->requires_grad = inputs.at(0)->requires_grad(); - if (!ctx->requires_grad) { return Maybe::Ok(); } - ctx->SaveTensorForBackward(inputs.at(0)); - return Maybe::Ok(); - } - - Maybe Apply(const SquareReLUCaptureState* ctx, const TensorTuple& out_grads, - TensorTuple* in_grads) const override { - CHECK_EQ_OR_RETURN(out_grads.size(), 1); // NOLINT(maybe-need-error-msg) - in_grads->resize(1); - if (ctx->requires_grad) { - const auto& x = ctx->SavedTensors().at(0); - in_grads->at(0) = JUST(functional::SquareReLUGrad(out_grads.at(0), x)); - } - return Maybe::Ok(); - } -}; - class HardSigmoid : public BaseActivation { public: Maybe Apply(const BaseActivationCaptureState* ctx, const TensorTuple& out_grads, @@ -668,7 +638,6 @@ REGISTER_OP_EXPR_GRAD_FUNCTION("softplus", Softplus); REGISTER_OP_EXPR_GRAD_FUNCTION("softshrink", SoftShrink); REGISTER_OP_EXPR_GRAD_FUNCTION("fast_gelu", FastGeLU); REGISTER_OP_EXPR_GRAD_FUNCTION("quick_gelu", QuickGeLU); -REGISTER_OP_EXPR_GRAD_FUNCTION("square_relu", SquareReLU); } // namespace one } // namespace oneflow diff --git a/oneflow/core/ep/common/primitive/broadcast_elementwise_binary.h b/oneflow/core/ep/common/primitive/broadcast_elementwise_binary.h index d3f20dce80e..c30835decc2 100644 --- a/oneflow/core/ep/common/primitive/broadcast_elementwise_binary.h +++ b/oneflow/core/ep/common/primitive/broadcast_elementwise_binary.h @@ -126,8 +126,7 @@ inline bool IsDimsEquals(size_t 
num_src0_dims, const int64_t* src0_dims, size_t OF_PP_MAKE_TUPLE_SEQ(BinaryOp::kTanhBackwardWithDyY) \ OF_PP_MAKE_TUPLE_SEQ(BinaryOp::kThresholdBackwardWithDyX) \ OF_PP_MAKE_TUPLE_SEQ(BinaryOp::kFastGeluBackwardWithDyX) \ - OF_PP_MAKE_TUPLE_SEQ(BinaryOp::kQuickGeluBackwardWithDyX) \ - OF_PP_MAKE_TUPLE_SEQ(BinaryOp::kSquareReLUBackwardWithDyX) + OF_PP_MAKE_TUPLE_SEQ(BinaryOp::kQuickGeluBackwardWithDyX) #define BINARY_ACTIVATION_BACKWARD_OP_SEQ \ BINARY_ACTIVATION_BACKWARD_OP_SEQ_0 \ diff --git a/oneflow/core/ep/common/primitive/elementwise_unary.h b/oneflow/core/ep/common/primitive/elementwise_unary.h index ec7651e1047..14fcce26feb 100644 --- a/oneflow/core/ep/common/primitive/elementwise_unary.h +++ b/oneflow/core/ep/common/primitive/elementwise_unary.h @@ -86,8 +86,7 @@ namespace primitive { OF_PP_MAKE_TUPLE_SEQ(UnaryOp::kNotEqualZero) \ OF_PP_MAKE_TUPLE_SEQ(UnaryOp::kNanAssign) \ OF_PP_MAKE_TUPLE_SEQ(UnaryOp::kFastGelu) \ - OF_PP_MAKE_TUPLE_SEQ(UnaryOp::kQuickGelu) \ - OF_PP_MAKE_TUPLE_SEQ(UnaryOp::kSquareReLU) + OF_PP_MAKE_TUPLE_SEQ(UnaryOp::kQuickGelu) #define UNARY_COMPLEX_C2C_OP_SEQ \ OF_PP_MAKE_TUPLE_SEQ(UnaryOp::kConj) \ diff --git a/oneflow/core/ep/cpu/primitive/binary_functor.h b/oneflow/core/ep/cpu/primitive/binary_functor.h index 56422deedf6..d479a7a7409 100644 --- a/oneflow/core/ep/cpu/primitive/binary_functor.h +++ b/oneflow/core/ep/cpu/primitive/binary_functor.h @@ -309,16 +309,6 @@ struct BinaryFunctor(1.702); }; -template -struct BinaryFunctor { - OF_DEVICE_FUNC BinaryFunctor(Scalar attr0, Scalar attr1) {} - - OF_DEVICE_FUNC Dst operator()(Src dy, Src x) const { - return static_cast((x > static_cast(0.0)) ? static_cast(2.0) * x * dy - : static_cast(0.0)); - } -}; - template struct BinaryFunctor { OF_DEVICE_FUNC BinaryFunctor(Scalar attr0, Scalar attr1) {} diff --git a/oneflow/core/ep/cpu/primitive/unary_functor.h b/oneflow/core/ep/cpu/primitive/unary_functor.h index 4745ff7cdc6..5c5a236df07 100644 --- a/oneflow/core/ep/cpu/primitive/unary_functor.h +++ b/oneflow/core/ep/cpu/primitive/unary_functor.h @@ -64,15 +64,6 @@ struct UnaryFunctor { static constexpr Src alpha = static_cast(1.702); }; -template -struct UnaryFunctor { - OF_DEVICE_FUNC UnaryFunctor(Scalar attr0, Scalar attr1) {} - - OF_DEVICE_FUNC Dst operator()(Src src) const { - return static_cast((src > static_cast(0.0)) ? src * src : 0); - } -}; - template struct UnaryFunctor { OF_DEVICE_FUNC UnaryFunctor(Scalar attr0, Scalar attr1) {} @@ -380,7 +371,6 @@ SPECIALIZATION_CPU_BFLOAT16_UNARY_FUNCTOR(UnaryOp::kReciprocalNoNan); SPECIALIZATION_CPU_BFLOAT16_UNARY_FUNCTOR(UnaryOp::kNotEqualZero); SPECIALIZATION_CPU_BFLOAT16_UNARY_FUNCTOR(UnaryOp::kFastGelu); SPECIALIZATION_CPU_BFLOAT16_UNARY_FUNCTOR(UnaryOp::kQuickGelu); -SPECIALIZATION_CPU_BFLOAT16_UNARY_FUNCTOR(UnaryOp::kSquareReLU); SPECIALIZATION_CPU_BFLOAT16_UNARY_FUNCTOR(UnaryOp::kDigamma); SPECIALIZATION_CPU_BFLOAT16_UNARY_FUNCTOR(UnaryOp::kTrigamma); diff --git a/oneflow/core/ep/cuda/primitive/binary_functor.cuh b/oneflow/core/ep/cuda/primitive/binary_functor.cuh index 29cf11cf6c4..fa360490659 100644 --- a/oneflow/core/ep/cuda/primitive/binary_functor.cuh +++ b/oneflow/core/ep/cuda/primitive/binary_functor.cuh @@ -150,16 +150,6 @@ struct BinaryFunctor(1.702); }; -template -struct BinaryFunctor { - OF_DEVICE_FUNC BinaryFunctor(Scalar attr0, Scalar attr1) {} - - OF_DEVICE_FUNC Dst operator()(Src dy, Src x) const { - return static_cast((x > static_cast(0.0)) ? 
static_cast(2.0) * x * dy - : static_cast(0.0)); - } -}; - template struct BinaryFunctor { OF_DEVICE_FUNC BinaryFunctor(Scalar attr0, Scalar attr1) {} @@ -415,7 +405,6 @@ SPECIALIZATION_PSEUDO_BFLOAT16_BINARY_FUNCTOR(BinaryOp::kTanhBackwardWithDyY); SPECIALIZATION_PSEUDO_BFLOAT16_BINARY_FUNCTOR(BinaryOp::kThresholdBackwardWithDyX); SPECIALIZATION_PSEUDO_BFLOAT16_BINARY_FUNCTOR(BinaryOp::kFastGeluBackwardWithDyX); SPECIALIZATION_PSEUDO_BFLOAT16_BINARY_FUNCTOR(BinaryOp::kQuickGeluBackwardWithDyX); -SPECIALIZATION_PSEUDO_BFLOAT16_BINARY_FUNCTOR(BinaryOp::kSquareReLUBackwardWithDyX); SPECIALIZATION_PSEUDO_BFLOAT16_BINARY_FUNCTOR(BinaryOp::kAcosBackwardWithDyX); SPECIALIZATION_PSEUDO_BFLOAT16_BINARY_FUNCTOR(BinaryOp::kAcoshBackwardWithDyX); @@ -490,7 +479,6 @@ SPECIALIZATION_PSEUDO_HALF_BINARY_FUNCTOR(BinaryOp::kThresholdBackwardWithDyX); SPECIALIZATION_PSEUDO_HALF_BINARY_FUNCTOR(BinaryOp::kTanhBackwardWithDyY); SPECIALIZATION_PSEUDO_HALF_BINARY_FUNCTOR(BinaryOp::kFastGeluBackwardWithDyX); SPECIALIZATION_PSEUDO_HALF_BINARY_FUNCTOR(BinaryOp::kQuickGeluBackwardWithDyX); -SPECIALIZATION_PSEUDO_HALF_BINARY_FUNCTOR(BinaryOp::kSquareReLUBackwardWithDyX); SPECIALIZATION_PSEUDO_HALF_BINARY_FUNCTOR(BinaryOp::kAcosBackwardWithDyX); SPECIALIZATION_PSEUDO_HALF_BINARY_FUNCTOR(BinaryOp::kAcoshBackwardWithDyX); diff --git a/oneflow/core/ep/cuda/primitive/unary_functor.cuh b/oneflow/core/ep/cuda/primitive/unary_functor.cuh index 61441e31288..41dbffb23b0 100644 --- a/oneflow/core/ep/cuda/primitive/unary_functor.cuh +++ b/oneflow/core/ep/cuda/primitive/unary_functor.cuh @@ -70,15 +70,6 @@ struct UnaryFunctor { static constexpr Src alpha = static_cast(1.702); }; -template -struct UnaryFunctor { - OF_DEVICE_FUNC UnaryFunctor(Scalar attr0, Scalar attr1) {} - - OF_DEVICE_FUNC Dst operator()(Src src) const { - return static_cast((src > static_cast(0.0)) ? 
src * src : 0); - } -}; - namespace unary_functor_internal { namespace { @@ -500,7 +491,6 @@ SPECIALIZATION_PSEUDO_HALF_UNARY_FUNCTOR(UnaryOp::kReciprocalNoNan); SPECIALIZATION_PSEUDO_HALF_UNARY_FUNCTOR(UnaryOp::kNotEqualZero); SPECIALIZATION_PSEUDO_HALF_UNARY_FUNCTOR(UnaryOp::kNanAssign); SPECIALIZATION_PSEUDO_HALF_UNARY_FUNCTOR(UnaryOp::kQuickGelu); -SPECIALIZATION_PSEUDO_HALF_UNARY_FUNCTOR(UnaryOp::kSquareReLU); /*********nv_bfloat16_kernel*******/ @@ -568,7 +558,6 @@ SPECIALIZATION_PSEUDO_BFLOAT16_UNARY_FUNCTOR(UnaryOp::kNotEqualZero); SPECIALIZATION_PSEUDO_BFLOAT16_UNARY_FUNCTOR(UnaryOp::kNanAssign); SPECIALIZATION_PSEUDO_BFLOAT16_UNARY_FUNCTOR(UnaryOp::kFastGelu); SPECIALIZATION_PSEUDO_BFLOAT16_UNARY_FUNCTOR(UnaryOp::kQuickGelu); -SPECIALIZATION_PSEUDO_BFLOAT16_UNARY_FUNCTOR(UnaryOp::kSquareReLU); SPECIALIZATION_PSEUDO_BFLOAT16_UNARY_FUNCTOR(UnaryOp::kDigamma); SPECIALIZATION_PSEUDO_BFLOAT16_UNARY_FUNCTOR(UnaryOp::kTrigamma); diff --git a/oneflow/core/ep/include/primitive/binary_op.h b/oneflow/core/ep/include/primitive/binary_op.h index fea46ff4c3f..e505fdae1f4 100644 --- a/oneflow/core/ep/include/primitive/binary_op.h +++ b/oneflow/core/ep/include/primitive/binary_op.h @@ -109,7 +109,6 @@ enum class BinaryOp { kTanBackwardWithDyX, kFastGeluBackwardWithDyX, kQuickGeluBackwardWithDyX, - kSquareReLUBackwardWithDyX, }; } diff --git a/oneflow/core/ep/include/primitive/unary_op.h b/oneflow/core/ep/include/primitive/unary_op.h index 62acd0276a0..cab540adb4a 100644 --- a/oneflow/core/ep/include/primitive/unary_op.h +++ b/oneflow/core/ep/include/primitive/unary_op.h @@ -43,7 +43,6 @@ enum class UnaryOp { kThreshold, kFastGelu, kQuickGelu, - kSquareReLU, // math op kAbs, kAcos, diff --git a/oneflow/core/functional/functional_api.yaml b/oneflow/core/functional/functional_api.yaml index 3323db0e93c..8fd34401c1b 100644 --- a/oneflow/core/functional/functional_api.yaml +++ b/oneflow/core/functional/functional_api.yaml @@ -755,14 +755,6 @@ signature: "Tensor (Tensor dy, Tensor x) => QuickGeluGrad" bind_python: False -- name: "square_relu" - signature: "Tensor (Tensor x) => SquareReLU" - bind_python: True - -- name: "square_relu_grad" - signature: "Tensor (Tensor dy, Tensor x) => SquareReLUGrad" - bind_python: False - - name: "gelu_with_approximate" signature: 'Tensor (Tensor x, String approximate="none") => GeluWithApproximate' bind_python: True diff --git a/oneflow/core/functional/impl/activation_functor.cpp b/oneflow/core/functional/impl/activation_functor.cpp index b0446f79bd8..2aa2d19fb6e 100644 --- a/oneflow/core/functional/impl/activation_functor.cpp +++ b/oneflow/core/functional/impl/activation_functor.cpp @@ -247,21 +247,6 @@ class QuickGeluGradFunctor : public BinaryFunctor { } }; -class SquareReLUFunctor : public UnaryFunctor { - public: - SquareReLUFunctor() { - op_ = CHECK_JUST(one::OpBuilder("square_relu").Input("x").Output("y").Build()); - } -}; - -class SquareReLUGradFunctor : public BinaryFunctor { - public: - SquareReLUGradFunctor() { - op_ = - CHECK_JUST(one::OpBuilder("square_relu_grad").Input("dy").Input("x").Output("dx").Build()); - } -}; - class GluFunctor { public: GluFunctor() {} @@ -794,8 +779,6 @@ ONEFLOW_FUNCTION_LIBRARY(m) { m.add_functor("FastGeluGrad"); m.add_functor("QuickGelu"); m.add_functor("QuickGeluGrad"); - m.add_functor("SquareReLU"); - m.add_functor("SquareReLUGrad"); m.add_functor("Glu"); m.add_functor("HardSigmoid"); m.add_functor("HardSigmoidGrad"); diff --git a/oneflow/core/functional/impl/nn_functor.cpp b/oneflow/core/functional/impl/nn_functor.cpp index 
ef4f2f92070..995cc937bdb 100644 --- a/oneflow/core/functional/impl/nn_functor.cpp +++ b/oneflow/core/functional/impl/nn_functor.cpp @@ -5223,8 +5223,6 @@ class GroupedMatmulFunctor { Maybe operator()(const TensorTuple& xs, const TensorTuple& weights) const { const int64_t input_size = xs.size(); const int64_t weight_size = weights.size(); - CHECK_LT_OR_RETURN(input_size, kMaxInputCount) - << Error::RuntimeError() << "input_size size should not be greater than 128"; CHECK_GE_OR_RETURN(input_size, 1) << Error::RuntimeError() << "The number of xs should be greater equal than 1."; CHECK_EQ_OR_RETURN(weight_size, input_size) diff --git a/oneflow/core/graph/stream_id.cpp b/oneflow/core/graph/stream_id.cpp index 2025ee676e1..cc55718fd4e 100644 --- a/oneflow/core/graph/stream_id.cpp +++ b/oneflow/core/graph/stream_id.cpp @@ -20,9 +20,9 @@ namespace oneflow { // StreamId encoding (bits) // | reserved | node_index | device_type | device_index | stream_index | -// | -- 18 -- | ----- 19 ----- | ---- 5 ---- | ----- 7 ----- | | +// | -- 21 -- | ----- 19 ----- | ---- 5 ---- | ----- 7 ----- | | // | | DeviceId | | -// | | ------------------- 31 --------------------- | ---- 15 ---- | +// | | ------------------- 31 --------------------- | ---- 12 ---- | // | StreamId | // | -------------------------------- 64 ---------------------------------- | diff --git a/oneflow/core/graph/task_id.cpp b/oneflow/core/graph/task_id.cpp index 1cb1bf1db6f..00d3b0b1483 100644 --- a/oneflow/core/graph/task_id.cpp +++ b/oneflow/core/graph/task_id.cpp @@ -20,9 +20,9 @@ namespace oneflow { // TaskId encoding (maybe extended to 128 bits in future) // | rank | device_type | device_index | | -// | ----------- 16 ----------- | ---- 5 ---- | ----- 7 ----- | | +// | ----------- 19 ----------- | ---- 5 ---- | ----- 7 ----- | | // | DeviceId | stream_index | | -// | ------------------------- 31 --------------------------- | ---- 15 ---- | | +// | ------------------------- 31 --------------------------- | ---- 12 ---- | | // | StreamId | task_index | // | -------------------------------- 43 ----------------------------------- | --- 21 --- | // | TaskId | diff --git a/oneflow/ir/include/OneFlow/OneFlowUserOps.td b/oneflow/ir/include/OneFlow/OneFlowUserOps.td index 05f5ea56bc3..a05cb67c91d 100644 --- a/oneflow/ir/include/OneFlow/OneFlowUserOps.td +++ b/oneflow/ir/include/OneFlow/OneFlowUserOps.td @@ -344,20 +344,6 @@ def OneFlow_QuickGeluGradOp : OneFlow_BaseOp<"quick_gelu_grad", [NoMemoryEffect, let has_data_type_infer_fn = 1; } -def OneFlow_SquareReLUGradOp : OneFlow_BaseOp<"square_relu_grad", [NoMemoryEffect, DeclareOpInterfaceMethods]> { - let input = (ins - OneFlow_Tensor:$x, - OneFlow_Tensor:$dy - ); - let output = (outs - OneFlow_Tensor:$dx - ); - let has_logical_tensor_desc_infer_fn = 1; - let has_physical_tensor_desc_infer_fn = 1; - let has_get_sbp_fn = 1; - let has_data_type_infer_fn = 1; -} - def OneFlow_GridSampleOp : OneFlow_BaseOp<"grid_sample", [NoMemoryEffect, DeclareOpInterfaceMethods]> { let input = (ins OneFlow_Tensor:$input, @@ -10428,19 +10414,6 @@ def OneFlow_QuickGeluOp : OneFlow_BaseOp<"quick_gelu", [NoMemoryEffect, DeclareO let has_data_type_infer_fn = 1; } -def OneFlow_SquareReLUOp : OneFlow_BaseOp<"square_relu", [NoMemoryEffect, DeclareOpInterfaceMethods]> { - let input = (ins - OneFlow_Tensor:$x - ); - let output = (outs - OneFlow_Tensor:$y - ); - let has_logical_tensor_desc_infer_fn = 1; - let has_physical_tensor_desc_infer_fn = 1; - let has_get_sbp_fn = 1; - let has_data_type_infer_fn = 1; -} - def 
OneFlow_HardsigmoidOp : OneFlow_BaseOp<"hardsigmoid", [NoMemoryEffect, DeclareOpInterfaceMethods]> { let input = (ins OneFlow_Tensor:$in diff --git a/oneflow/ir/lib/OneFlow/Passes.cpp b/oneflow/ir/lib/OneFlow/Passes.cpp index 844a3331a9e..535c18aeabf 100644 --- a/oneflow/ir/lib/OneFlow/Passes.cpp +++ b/oneflow/ir/lib/OneFlow/Passes.cpp @@ -944,6 +944,7 @@ struct KernelLaunchPattern : public mlir::OpRewritePattern { int name_index = 0; std::vector current_wrap_ops; + op->dump(); for (auto op_it = ops.begin(); op_it != ops.end(); ++op_it) { auto current_op = &(*op_it); if (!IsPackagable(current_op)) { diff --git a/oneflow/user/kernels/activation_kernels.cpp b/oneflow/user/kernels/activation_kernels.cpp index 3094858cff8..baeab19adaa 100644 --- a/oneflow/user/kernels/activation_kernels.cpp +++ b/oneflow/user/kernels/activation_kernels.cpp @@ -282,32 +282,6 @@ REGISTER_USER_KERNEL("quick_gelu_grad") }) .SetIsMatchedHob(BinaryPrimitiveExists(ep::primitive::BinaryOp::kQuickGeluBackwardWithDyX, "dx", "dy")); -REGISTER_USER_KERNEL("square_relu") - .SetCreateFn([]() { - return user_op::NewOpKernel( - "y", "x", [](user_op::KernelComputeContext* ctx) { - const user_op::TensorDesc* src = ctx->TensorDesc4ArgNameAndIndex("x", 0); - const user_op::TensorDesc* dst = ctx->TensorDesc4ArgNameAndIndex("y", 0); - return ep::primitive::NewPrimitive( - ctx->device_type(), ep::primitive::UnaryOp::kSquareReLU, src->data_type(), - dst->data_type()); - }); - }) - .SetIsMatchedHob(UnaryPrimitiveExists(ep::primitive::UnaryOp::kSquareReLU, "y", "x")); - -REGISTER_USER_KERNEL("square_relu_grad") - .SetCreateFn([]() { - return user_op::NewOpKernel( - "dx", "dy", "x", [](user_op::KernelComputeContext* ctx) { - const user_op::TensorDesc* src = ctx->TensorDesc4ArgNameAndIndex("dy", 0); - const user_op::TensorDesc* dst = ctx->TensorDesc4ArgNameAndIndex("dx", 0); - return ep::primitive::NewPrimitive( - ctx->device_type(), ep::primitive::BinaryOp::kSquareReLUBackwardWithDyX, - src->data_type(), dst->data_type(), 1 /*max_num_dims*/); - }); - }) - .SetIsMatchedHob(BinaryPrimitiveExists(ep::primitive::BinaryOp::kSquareReLUBackwardWithDyX, - "dx", "dy")); REGISTER_USER_KERNEL("leaky_relu") .SetCreateFn([]() { diff --git a/oneflow/user/kernels/grouped_matmul_bias.cu b/oneflow/user/kernels/grouped_matmul_bias.cu index 2022fbec012..c23d9c925b8 100644 --- a/oneflow/user/kernels/grouped_matmul_bias.cu +++ b/oneflow/user/kernels/grouped_matmul_bias.cu @@ -190,13 +190,7 @@ class GroupedMatmulBiasKernel final : public user_op::OpKernel, public user_op:: } void* workspace = ctx->Tensor4ArgNameAndIndex("tmp_buffer", 0)->mut_dptr(); for (const auto& group : groups) { - for (size_t i = 0; i < group.second.size(); i += kMaxProblemBatch) { - std::vector> ptrs( - {group.second.begin() + i, - group.second.begin() + i - + std::min(group.second.size() - i, kMaxProblemBatch)}); - ApplyGroup(group.first, ptrs, has_biases, workspace, ctx->stream()); - } + ApplyGroup(group.first, group.second, has_biases, workspace, ctx->stream()); } } bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; } diff --git a/oneflow/user/ops/square_relu_op.cpp b/oneflow/user/ops/square_relu_op.cpp deleted file mode 100644 index 7f64c138888..00000000000 --- a/oneflow/user/ops/square_relu_op.cpp +++ /dev/null @@ -1,77 +0,0 @@ -/* -Copyright 2020 The OneFlow Authors. All rights reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. 
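
> Reviewer note on the `grouped_matmul_bias.cu` hunk above (hedged; names are illustrative): the removed loop split each group's problem list into slices of at most `kMaxProblemBatch` before calling `ApplyGroup`, whereas after this change the whole group is passed in a single call (the matching `CHECK_LT_OR_RETURN(input_size, kMaxInputCount)` guard in `nn_functor.cpp` is removed in the same spirit). A Python sketch of the chunking behavior being removed:

```python
# Pseudocode mirror of the removed C++ loop:
#   for (size_t i = 0; i < group.second.size(); i += kMaxProblemBatch) ...
# Processes `problems` in slices of at most max_problem_batch.
def apply_in_chunks(problems, max_problem_batch, apply_group):
    for i in range(0, len(problems), max_problem_batch):
        apply_group(problems[i : i + max_problem_batch])

apply_in_chunks(list(range(10)), 4, print)  # -> [0..3], [4..7], [8, 9]
```
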
-You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -*/ -#include "oneflow/core/framework/framework.h" -#include "oneflow/core/framework/op_generated.h" - -namespace oneflow { - -/*static*/ Maybe SquareReLUOp::InferLogicalTensorDesc(user_op::InferContext* ctx) { - ctx->SetOutputShape("y", 0, ctx->InputShape("x", 0)); - return Maybe::Ok(); -} - -/*static*/ Maybe SquareReLUOp::InferPhysicalTensorDesc(user_op::InferContext* ctx) { - return InferLogicalTensorDesc(ctx); -} - -/*static*/ Maybe SquareReLUOp::InferDataType(user_op::InferContext* ctx) { - ctx->SetOutputDType("y", 0, ctx->InputDType("x", 0)); - return Maybe::Ok(); -} - -/*static*/ Maybe SquareReLUOp::GetSbp(user_op::SbpContext* ctx) { - const user_op::TensorDesc& in_tensor = ctx->LogicalTensorDesc4InputArgNameAndIndex("x", 0); - FOR_RANGE(int64_t, i, 0, in_tensor.shape().NumAxes()) { - ctx->NewBuilder().Split(user_op::OpArg("x", 0), i).Split(user_op::OpArg("y", 0), i).Build(); - } - return Maybe::Ok(); -} - -/*static*/ Maybe SquareReLUGradOp::InferLogicalTensorDesc(user_op::InferContext* ctx) { - const Shape& x_shape = ctx->InputShape("x", 0); - const Shape& dy_shape = ctx->InputShape("dy", 0); - CHECK_OR_RETURN(dy_shape == x_shape) - << "InferTensorDesc failed (" << ctx->op_name() << "). Expected x shape " - << x_shape.ToString() << " to be equal to dy shape " << dy_shape.ToString(); - ctx->SetOutputShape("dx", 0, dy_shape); - return Maybe::Ok(); -} - -/*static*/ Maybe SquareReLUGradOp::InferPhysicalTensorDesc(user_op::InferContext* ctx) { - return InferLogicalTensorDesc(ctx); -} - -/*static*/ Maybe SquareReLUGradOp::InferDataType(user_op::InferContext* ctx) { - CHECK_EQ_OR_RETURN(ctx->InputDType("x", 0), ctx->InputDType("dy", 0)) - << "InferDataType Failed. Expected " << DataType_Name(ctx->InputDType("dy", 0)) - << ", but got " << DataType_Name(ctx->InputDType("x", 0)); - ctx->SetOutputDType("dx", 0, ctx->InputDType("x", 0)); - return Maybe::Ok(); -} - -/*static*/ Maybe SquareReLUGradOp::GetSbp(user_op::SbpContext* ctx) { - const user_op::TensorDesc& x_tensor = ctx->LogicalTensorDesc4InputArgNameAndIndex("x", 0); - FOR_RANGE(int64_t, i, 0, x_tensor.shape().NumAxes()) { - ctx->NewBuilder() - .Split(user_op::OpArg("x", 0), i) - .Split(user_op::OpArg("dy", 0), i) - .Split(user_op::OpArg("dx", 0), i) - .Build(); - } - return Maybe::Ok(); -} - -} // namespace oneflow diff --git a/python/oneflow/__init__.py b/python/oneflow/__init__.py index 6dc7520a410..85faa262056 100644 --- a/python/oneflow/__init__.py +++ b/python/oneflow/__init__.py @@ -130,7 +130,6 @@ def use_deterministic_algorithms(mode, *, warn_only=False): from oneflow._C import quantile from oneflow._C import gelu_with_approximate as gelu from oneflow._C import quick_gelu -from oneflow._C import square_relu from oneflow._C import mish from oneflow._C import repeat from oneflow._C import repeat_interleave diff --git a/python/oneflow/_dynamo/__init__.py b/python/oneflow/_dynamo/__init__.py deleted file mode 100644 index abc1eea891a..00000000000 --- a/python/oneflow/_dynamo/__init__.py +++ /dev/null @@ -1,33 +0,0 @@ -""" -Copyright 2020 The OneFlow Authors. All rights reserved. 
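
> Reviewer note on the `square_relu` removal (this op definition plus the functors, kernels, gradient function, and Python bindings deleted above): per the removed CPU/CUDA functors, the op computed y = x² when x > 0 (else 0), with gradient dx = 2·x·dy when x > 0. A hedged NumPy sketch of those semantics, not the deleted kernels themselves:

```python
import numpy as np

def square_relu(x):
    # Forward from the removed unary functor: (x > 0) ? x * x : 0
    return np.where(x > 0, x * x, 0.0)

def square_relu_grad(dy, x):
    # Backward from the removed binary functor: (x > 0) ? 2 * x * dy : 0
    return np.where(x > 0, 2.0 * x * dy, 0.0)

x = np.array([-1.0, 0.5, 2.0])
print(square_relu(x))                         # [0.   0.25 4.  ]
print(square_relu_grad(np.ones_like(x), x))   # [0. 1. 4.]
```
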
- -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -""" -import warnings - -# Reference: https://github.com/pytorch/pytorch/blob/v2.0.1/torch/_dynamo/__init__.py -__all__ = [ - "allow_in_graph", -] - - -def allow_in_graph(fn): - """ - """ - if isinstance(fn, (list, tuple)): - return [allow_in_graph(x) for x in fn] - assert callable(fn), "allow_in_graph expects a callable" - warnings.warn( - "The oneflow._dynamo.allow_in_graph interface is just to align the torch._dynamo.allow_in_graph interface and has no practical significance." - ) - return fn diff --git a/python/oneflow/framework/args_tree.py b/python/oneflow/framework/args_tree.py index 50f4e6a7fcb..afd38c9907b 100644 --- a/python/oneflow/framework/args_tree.py +++ b/python/oneflow/framework/args_tree.py @@ -41,15 +41,12 @@ class NamedArg(object): named_input = NamedArg([NamedArg(1), NamedArg({key: NamedArg("value")})]) """ - def __init__( - self, prefix="", name=None, global_index=0, tensor_type=Tensor - ) -> None: + def __init__(self, prefix="", name=None, global_index=0) -> None: self._name = name if name is not None else str(global_index) self._prefix = prefix self._global_index = global_index self._is_value_set = False self._value = None - self._tensor_type = tensor_type def prefix(self): return self._prefix @@ -89,28 +86,21 @@ def __repr__(self): repr_str += "LIST" elif _is_raw_type(self._value, dict) or _is_raw_type(self._value, OrderedDict): repr_str += "DICT" - elif isinstance(self._value, self._tensor_type): + elif isinstance(self._value, Tensor): repr_str += "TENSOR" elif self._value is None: repr_str += "NONE" else: repr_str += "OPAQUE" - - if isinstance(self._value, self._tensor_type): - repr_str += ( - ", value: tensor(" - + str(self._value.shape) - + ", " - + str(self._value.dtype) - + ")" - ) + if isinstance(self._value, Tensor): + repr_str += ", value: " + self._value._meta_repr() elif ( _is_raw_type(self._value, dict) or _is_raw_type(self._value, OrderedDict) or _is_raw_type(self._value, list) or _is_raw_type(self._value, tuple) ): - repr_str += ", value: " + repr(self._value) + pass else: repr_str += ", value: " + repr(self._value) repr_str += ")" @@ -124,7 +114,6 @@ def __init__( gen_name: bool = False, root_prefix: str = "", root_name: str = None, - tensor_type=Tensor, ) -> None: self._io_args = io_args @@ -133,7 +122,6 @@ def __init__( self._root_name = root_name self._named_io_args = None self._next_global_index = 0 - self._tensor_type = tensor_type if self._gen_name: self._named_io_args = self._construct_named_io_args( @@ -190,7 +178,7 @@ def iter_named_nodes(self): yield (named_node.prefix() + "_" + named_node.name(), named_node) def _construct_named_io_args(self, value, prefix: str, name: str) -> NamedArg: - arg = NamedArg(prefix, name, self._next_global_index, self._tensor_type) + arg = NamedArg(prefix, name, self._next_global_index) self._next_global_index += 1 if _is_raw_type(value, list) or _is_raw_type(value, tuple): @@ -231,7 +219,7 @@ def map_tuple_leaf(self, map_function: Callable): stack = [] # Cases handled: tuple(tensor, 
...), such as input args. - if len(self._io_args) > 0 and isinstance(self._io_args[0], self._tensor_type): + if len(self._io_args) > 0 and isinstance(self._io_args[0], Tensor): for i in self._io_args: mapped_value = map_function(i) stack.append(mapped_value) @@ -245,7 +233,7 @@ def map_tuple_leaf(self, map_function: Callable): elif ( len(self._io_args) > 0 and isinstance(self._io_args[0], (tuple, list)) - and all(isinstance(arg, self._tensor_type) for arg in self._io_args[0]) + and all(isinstance(arg, Tensor) for arg in self._io_args[0]) ): for i in self._io_args[0]: mapped_value = map_function(i) @@ -295,9 +283,3 @@ def _execute_mapping(self, value, map_function): mapped_value = map_function(value) return mapped_value - - def __repr__(self): - if self._named_io_args: - return self._named_io_args.__repr__() - else: - return str(self.__class__) diff --git a/python/oneflow/framework/docstr/activation.py b/python/oneflow/framework/docstr/activation.py index 402b0f56c6e..dc1e8f48391 100644 --- a/python/oneflow/framework/docstr/activation.py +++ b/python/oneflow/framework/docstr/activation.py @@ -136,28 +136,6 @@ """, ) -add_docstr( - oneflow._C.square_relu, - r""" - square_relu(x: Tensor) -> Tensor - - Applies the relu^2 activation introduced in https://arxiv.org/abs/2109.08668v2 - - .. math:: - \\text{ReLU}(x) = \\max(0, x) * \\max(0, x) - - Args: - input (oneflow.Tensor): Input Tensor - - Returns: - oneflow.Tensor: A Tensor has same shape as the input. - - See - :class:`~oneflow.nn.SquareReLU` for more details. - - """, -) - add_docstr( oneflow._C.softmax, r""" diff --git a/python/oneflow/nn/__init__.py b/python/oneflow/nn/__init__.py index b4465a82ad0..864a6e9bd8d 100644 --- a/python/oneflow/nn/__init__.py +++ b/python/oneflow/nn/__init__.py @@ -20,7 +20,6 @@ CELU, GELU, QuickGELU, - SquareReLU, GLU, Hardsigmoid, Hardshrink, diff --git a/python/oneflow/nn/functional/__init__.py b/python/oneflow/nn/functional/__init__.py index 8019945c366..21afe67ab6d 100644 --- a/python/oneflow/nn/functional/__init__.py +++ b/python/oneflow/nn/functional/__init__.py @@ -40,7 +40,6 @@ from oneflow._C import max_unpool3d from oneflow._C import cosine_similarity, pairwise_distance from oneflow._C import relu -from oneflow._C import square_relu from oneflow._C import hardtanh from oneflow._C import hardsigmoid from oneflow._C import hardshrink diff --git a/python/oneflow/nn/graph/cache.py b/python/oneflow/nn/graph/cache.py index 059cece917a..7870e224656 100644 --- a/python/oneflow/nn/graph/cache.py +++ b/python/oneflow/nn/graph/cache.py @@ -20,7 +20,7 @@ from oneflow.framework.args_tree import ArgsTree from oneflow.framework.tensor import Tensor -import oneflow +import oneflow as flow class LRUCache(object): @@ -134,28 +134,6 @@ def runtime_state_dict( destination[state_dict["graph_name"]] = state_dict return destination - @staticmethod - def runtime_state_dict_to( - state_dict: Union[ - Dict[str, Union[Dict[str, Tensor], str]], - Dict[str, Dict[str, Union[Dict[str, Tensor], str]]], - ], - device: str, - ) -> Union[ - Dict[str, Union[Dict[str, Tensor], str]], - Dict[str, Dict[str, Union[Dict[str, Tensor], str]]], - ]: - destination = OrderedDict() - destination._metadata = OrderedDict() - for (key, sub_state_dict) in state_dict.items(): - dest_sub_state_dict = oneflow.nn.Graph.runtime_state_dict_to( - sub_state_dict, device - ) - dest_sub_state_dict["cache_order"] = sub_state_dict["cache_order"] - dest_sub_state_dict["cache_key"] = sub_state_dict["cache_key"] - destination[key] = dest_sub_state_dict - return 
destination - def _init_and_get_a_graph_in_cache(self, cache_key): self._base_graph._print( 0, diff --git a/python/oneflow/nn/graph/graph.py b/python/oneflow/nn/graph/graph.py index 8fdba595fe9..2b6aca3627c 100644 --- a/python/oneflow/nn/graph/graph.py +++ b/python/oneflow/nn/graph/graph.py @@ -52,9 +52,6 @@ GraphIR, seq_to_func_return, sys_exc_error_msg, - _rsd_sub_destination_to, - _job_to, - _plan_to, ) from oneflow.framework.args_tree import ArgsTree from oneflow.nn.modules.module import Module @@ -1072,35 +1069,34 @@ def _fill_sub_destination(dest_dict, name_list, tensor_tuple): assert len(tensor_tuple) == len(name_list) for name_idx in range(len(name_list)): tensor_item = tensor_tuple[name_idx] - device_str = ":".join( - (tensor_item.device.type, str(tensor_item.device.index)) - ) - dest_dict[name_list[name_idx]] = (tensor_item, device_str) + dest_dict[name_list[name_idx]] = (tensor_item, tensor_item.device.type) # This is original outputs is needed to build output buffer. tuple_idx = -1 - def gen_index_in_tuple(item): + def gen_index_in_tuple(eager_out): nonlocal tuple_idx - if isinstance(item, Tensor): - tuple_idx += 1 - return "_OFTPI" + str(tuple_idx) # oneflow tuple index - else: - return item + tuple_idx += 1 + return "_OFTPI" + str(tuple_idx) # oneflow tuple index inputs_sub_destination = OrderedDict() _fill_sub_destination( inputs_sub_destination, self._input_op_names, self._inputs_tensor_tuple ) - _eager_inputs_args, _eager_inputs_kwargs = self.__map_io_lite( - gen_index_in_tuple, *self.inputs_original[0], **self.inputs_original[1], + _eager_inputs_args, _eager_inputs_kwargs = self.__map_io( + "input", + gen_index_in_tuple, + *self.inputs_original[0], + **self.inputs_original[1], ) destination["inputs"] = inputs_sub_destination destination["inputs_original"] = (_eager_inputs_args, _eager_inputs_kwargs) tuple_idx = -1 - _eager_outputs, _ = self.__map_io_lite(gen_index_in_tuple, *self._eager_outputs) + _eager_outputs, _ = self.__map_io( + "output", gen_index_in_tuple, *self._eager_outputs + ) destination["outputs_original"] = _eager_outputs assert len(self._outputs_tensor_tuple) == tuple_idx + 1 outputs_sub_destination = OrderedDict() @@ -1150,7 +1146,7 @@ def load_runtime_state_dict( Dict[str, Dict[str, Union[Dict[str, Tensor], str]]], ], *, - warmup_with_run: bool = True, + warmup_with_run: bool = False, ) -> None: if self._run_with_cache == True: return self._dynamic_input_graph_cache.load_runtime_state_dict( @@ -1297,7 +1293,6 @@ def get_tensor_in_tuple(tensor_tuple, map_item): self.__run( *_eager_inputs_args, **_eager_inputs_kwargs ) # pre-run to warm up - oneflow._oneflow_internal.eager.Sync() build_graph_end = time.perf_counter() self.__print( 0, @@ -1309,53 +1304,6 @@ def get_tensor_in_tuple(tensor_tuple, map_item): + "\n", ) - @staticmethod - def runtime_state_dict_to( - state_dict: Union[ - Dict[str, Union[Dict[str, Tensor], str]], - Dict[str, Dict[str, Union[Dict[str, Tensor], str]]], - ], - device: str, - ) -> Union[ - Dict[str, Union[Dict[str, Tensor], str]], - Dict[str, Dict[str, Union[Dict[str, Tensor], str]]], - ]: - if "job_id" not in state_dict: - from oneflow.nn.graph.cache import GraphCache - - return GraphCache.runtime_state_dict_to(state_dict, device) - - dest_device = oneflow.device(device) - assert dest_device.type == "cuda", "device must be cuda." 
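
> Reviewer note on the removed `runtime_state_dict_to` path (here in `cache.py`, and in `graph.py`/`util.py` below): the deleted helper `_rsd_sub_destination_to` rewrote each `(tensor, device_str)` entry by copying the tensor to the destination device. A sketch of that mapping, assuming the same `(tensor, device_str)` value shape as the removed code:

```python
from collections import OrderedDict

import oneflow

def rsd_sub_destination_to(origin_dict, dest_device_str):
    # Mirrors the removed _rsd_sub_destination_to: copy every stored
    # tensor to dest_device_str and record the new device string.
    dest = OrderedDict()
    for name, (tensor, _device_str) in origin_dict.items():
        dest[name] = (
            tensor.to(device=oneflow.device(dest_device_str), copy=True),
            dest_device_str,
        )
    return dest
```
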
- - destination = OrderedDict() - destination._metadata = OrderedDict() - destination["oneflow_version"] = state_dict["oneflow_version"] - destination["graph_name"] = state_dict["graph_name"] - destination["job_id"] = state_dict["job_id"] - destination["inputs"] = _rsd_sub_destination_to(state_dict["inputs"], device) - destination["inputs_original"] = state_dict["inputs_original"] - destination["outputs"] = _rsd_sub_destination_to(state_dict["outputs"], device) - destination["outputs_original"] = state_dict["outputs_original"] - destination["oneflow_with_eager_tensor"] = state_dict[ - "oneflow_with_eager_tensor" - ] - if "states" in state_dict: - destination["states"] = _rsd_sub_destination_to( - state_dict["states"], device - ) - destination["exe_plan"] = _plan_to(state_dict["exe_plan"], dest_device) - if "forward_graph" in state_dict: - forward_graph = deepcopy(state_dict["forward_graph"]) - _job_to(forward_graph, dest_device) - destination["forward_graph"] = forward_graph - if "compile_graph" in state_dict: - compile_graph = deepcopy(state_dict["compile_graph"]) - _job_to(compile_graph, dest_device) - destination["compile_graph"] = compile_graph - destination["id_state"] = state_dict["id_state"] - return destination - def build_graph(self, *args, **kwargs): # Build graph try: @@ -1800,13 +1748,14 @@ def __build_io(self, io_type, build_func, *args, **kwargs): args_repr = [] tensor2op_name = {} - def build_tensor_or_any(tensor, name, repr_str): + def build_tensor_or_none(tensor, name, repr_str): + assert tensor is None or (isinstance(tensor, Tensor)) if isinstance(tensor, Tensor): build_arg = build_func(name, tensor) op_names.append(name) tensor2op_name[build_arg] = name else: - build_arg = tensor + build_arg = None args_repr.append(repr_str) self.__print(0, 1, repr_str) @@ -1822,13 +1771,18 @@ def leaf_arg_fn(arg): arg_repr = self.__io_item_check_and_gen_repr( arg.value(), Tensor, io_type, name ) - build_arg = build_tensor_or_any(arg.value(), name, arg_repr) + build_arg = build_tensor_or_none(arg.value(), name, arg_repr) return build_arg + elif arg.value() is None: + arg_repr = self.__io_item_check_and_gen_repr( + arg.value(), None, io_type, name + ) + build_arg = build_tensor_or_none(arg.value(), name, arg_repr) else: # Opaque + # Error arg_repr = self.__io_item_check_and_gen_repr( arg.value(), None, io_type, name ) - build_arg = build_tensor_or_any(arg.value(), name, arg_repr) out = args_tree.map_leaf(leaf_arg_fn) build_args = out[0] @@ -1838,7 +1792,7 @@ def leaf_arg_fn(arg): def __io_item_check_and_gen_repr(self, item, expect_type, io_type, name): assert io_type in ("input", "output") - if expect_type is None: + if expect_type is None and item is None: repr_str = ( "[WARNING](" + io_type.upper() @@ -1848,7 +1802,6 @@ def __io_item_check_and_gen_repr(self, item, expect_type, io_type, name): + str(type(item)) + ")" ) - self.__print(1, 0, repr_str) return repr_str elif expect_type is not None and isinstance(item, expect_type): if isinstance(item, Tensor): @@ -1878,21 +1831,27 @@ def __io_item_check_and_gen_repr(self, item, expect_type, io_type, name): def __map_io(self, io_type, func, *args, **kwargs): assert io_type in ("input", "output") - def mapping_tensor_or_any(tensor): + def mapping_tensor_or_none(tensor): + assert tensor is None or (isinstance(tensor, Tensor)) if isinstance(tensor, Tensor): mapped_arg = func(tensor) else: - mapped_arg = tensor + mapped_arg = None return mapped_arg def leaf_arg_fn(arg): arg_value = arg.value() - return mapping_tensor_or_any(arg_value) + if 
isinstance(arg_value, Tensor) or arg_value is None: + return mapping_tensor_or_none(arg_value) + else: + self.__io_item_check( + arg_value, None, io_type, arg.prefix() + "_" + arg.name(), + ) # NOTE(lixiang): Reduce the overhead of traversal and parsing of io args. if self._is_simple_tuple_output or self._is_simple_tuple_input: args_tree = ArgsTree(args, False) - out = args_tree.map_tuple_leaf(mapping_tensor_or_any) + out = args_tree.map_tuple_leaf(mapping_tensor_or_none) return out, kwargs args_tree = ArgsTree( diff --git a/python/oneflow/nn/graph/util.py b/python/oneflow/nn/graph/util.py index f60ecac07f3..f2ad36456a8 100644 --- a/python/oneflow/nn/graph/util.py +++ b/python/oneflow/nn/graph/util.py @@ -16,15 +16,12 @@ import sys from string import Template from typing import Callable, Dict, Union, List, Tuple, Optional -from collections import OrderedDict +import google.protobuf as protobuf from google.protobuf import text_format -from google.protobuf.message import Message import oneflow import oneflow.core.job.job_pb2 as job_pb -import oneflow.core.job.plan_pb2 as plan_pb -import oneflow.core.common.device_type_pb2 as device_type import oneflow.core.operator.op_conf_pb2 as op_conf_util from oneflow.framework.tensor import Tensor @@ -270,7 +267,11 @@ def _op_signature( return True, op_str -def operators_repr(ops: Message, graph_ir: GraphIR, show_op_loc: bool,) -> List[str]: +def operators_repr( + ops: protobuf.pyext._message.RepeatedCompositeContainer, + graph_ir: GraphIR, + show_op_loc: bool, +) -> List[str]: r"""Generate operators' string representation of this module """ graph_proto = graph_ir._graph_proto @@ -311,119 +312,3 @@ def seq_to_func_return(seq, need_unpack=False): if need_unpack: return seq[0] return seq - - -def _rsd_sub_destination_to(origin_dict, dest_device_str): - dest_dict = OrderedDict() - for k, v in origin_dict.items(): - tensor_item, device_str = v - dest_dict[k] = ( - tensor_item.to(device=oneflow.device(dest_device_str), copy=True), - dest_device_str, - ) - return dest_dict - - -def _parallel_conf_to(parallel_conf, dest_device): - if parallel_conf.device_tag == "cuda": - assert len(parallel_conf.device_name) == 1 - parallel_conf.device_name[0] = "@0:" + str(dest_device.index) - - -def _mem_case_to(mem_case, dest_device): - if mem_case.device_type == device_type.DeviceType.kCUDA: - mem_case.device_id = dest_device.index - if ( - mem_case.HasField("pinned_device_type") - and mem_case.pinned_device_type == device_type.DeviceType.kCUDA - ): - mem_case.pinned_device_id = dest_device.index - - -def _job_to(job, dest_device): - for pg in job.placement.placement_group: - _parallel_conf_to(pg.parallel_conf, dest_device) - for bpg in job.placement.blob_placement_group: - _parallel_conf_to(bpg.parallel_conf, dest_device) - - -def _modify_bits(original_num, k, j, new_num): - if k > j: - return original_num - mask = ((1 << (j - k + 1)) - 1) << k - cleared_num = original_num & ~mask - modified_num = cleared_num | ((new_num & ((1 << (j - k + 1)) - 1)) << k) - return modified_num - - -def _get_bits(original_num, k, j): - mask = ((1 << (j - k + 1)) - 1) << k - cleared_num = (original_num & mask) >> k - - return cleared_num - - -def _task_id_to(task_id, dest_device): - if _get_bits(task_id, 43, 48) == 2: - new_id = _modify_bits(task_id, 36, 43, dest_device.index) - - return new_id - else: - return task_id - - -def _thrd_id_to(thrd_id, dest_device): - if _get_bits(thrd_id, 22, 27) == 2: - new_id = _modify_bits(thrd_id, 15, 22, dest_device.index) - return new_id - else: - return 
thrd_id - - -def _plan_to(plan_str, dest_device): - plan = plan_pb.Plan() - plan.ParseFromString(plan_str) - for task in plan.task: - task.task_id = _task_id_to(task.task_id, dest_device) - task.thrd_id = _thrd_id_to(task.thrd_id, dest_device) - for node in task.exec_sequence.exec_node: - _parallel_conf_to( - node.kernel_conf.op_attribute.parallel_conf_signature.op_parallel_conf, - dest_device, - ) - for name, regst in task.produced_regst_desc.items(): - regst.producer_task_id = _task_id_to(regst.producer_task_id, dest_device) - for c_task_id_idx in range(len(regst.consumer_task_id)): - regst.consumer_task_id[c_task_id_idx] = _task_id_to( - regst.consumer_task_id[c_task_id_idx], dest_device - ) - _mem_case_to(regst.mem_case, dest_device) - for mem_block in plan.block_chunk_list.mem_block: - _mem_case_to(mem_block.mem_case, dest_device) - mem_block.thrd_id_hint = _thrd_id_to(mem_block.thrd_id_hint, dest_device) - for chunk in plan.block_chunk_list.chunk: - _mem_case_to(chunk.mem_case, dest_device) - - new_ctrl_regst_desc_id2producer_task_id = {} - for ( - regst_desc_id, - producer_task_id, - ) in plan.ctrl_regst_desc_info.ctrl_regst_desc_id2producer_task_id.items(): - new_ctrl_regst_desc_id2producer_task_id[regst_desc_id] = _task_id_to( - producer_task_id, dest_device - ) - for ( - regst_desc_id, - producer_task_id, - ) in new_ctrl_regst_desc_id2producer_task_id.items(): - plan.ctrl_regst_desc_info.ctrl_regst_desc_id2producer_task_id[ - regst_desc_id - ] = producer_task_id - - for job_id, op_attr_tab in plan.job_id2op_attribute_ref_table.items(): - for _, op_attr in op_attr_tab.op_name2op_attribute.items(): - _parallel_conf_to( - op_attr.parallel_conf_signature.op_parallel_conf, dest_device - ) - - return plan.SerializeToString() diff --git a/python/oneflow/nn/modules/activation.py b/python/oneflow/nn/modules/activation.py index ad410bb8fcf..1803958d773 100644 --- a/python/oneflow/nn/modules/activation.py +++ b/python/oneflow/nn/modules/activation.py @@ -407,45 +407,6 @@ def forward(self, x): return flow._C.quick_gelu(x) -class SquareReLU(Module): - """ - SquareReLU() -> Tensor - - Applies the relu^2 activation introduced in https://arxiv.org/abs/2109.08668v2 - - .. math:: - :math:`\\text{SquareReLU}(x) = \\max(0, x) * \\max(0, x)` - - Args: - input (oneflow.Tensor): Input Tensor - - Returns: - oneflow.Tensor: A Tensor has same shape as the input. - - For example: - - .. 
code-block:: python - - >>> import numpy as np - >>> import oneflow as flow - - >>> x = np.array([-0.5, 0, 0.5]).astype(np.float32) - >>> input = flow.Tensor(x) - >>> square_relu = flow.nn.SquareReLU() - - >>> out = square_relu(input) - >>> out - tensor([0.0000, 0.0000, 0.2500], dtype=oneflow.float32) - - """ - - def __init__(self): - super().__init__() - - def forward(self, x): - return flow._C.square_relu(x) - - class Sigmoid(Module): """Applies the element-wise function: diff --git a/python/oneflow/nn/modules/module.py b/python/oneflow/nn/modules/module.py index 512cceb37a6..3bdf4d63ca7 100644 --- a/python/oneflow/nn/modules/module.py +++ b/python/oneflow/nn/modules/module.py @@ -1268,8 +1268,6 @@ def register_forward_hook(self, hook: Callable[..., None]): return handle def _apply(self, fn): - if not hasattr(self, "cpg"): - self.cpg = None if self.cpg is not None: self.cpg = None warnings.warn( diff --git a/python/oneflow/test/expensive/test_graph_multi_graph_v2.py b/python/oneflow/test/expensive/test_graph_multi_graph_v2.py index a4563337630..360ebc8bb90 100644 --- a/python/oneflow/test/expensive/test_graph_multi_graph_v2.py +++ b/python/oneflow/test/expensive/test_graph_multi_graph_v2.py @@ -249,7 +249,7 @@ def build(self, x): @_with_new_session def _test_linear_multi_graph_load( - return_dict, device, with_reshape, state_dict, with_new_input + return_dict, device, with_reshape, state_dict, ): linear = flow.nn.Linear(3, 8, False) linear = linear.to(device) @@ -320,26 +320,25 @@ def build(self, x): test_case1 = np.array_equal(of_lazy_out1.numpy(), of_eager_out1.numpy()) return_dict["load1"] = test_case1 - if with_new_input: - # The following section is for testing the new input shape after completing the load. - input_arr2 = np.array( - [ - [-0.94630778, -0.83378579, -0.87060891], - [2.0289922, -0.28708987, -2.18369248], - [0.08086036, -1.81075924, 1.20752494], - ], - dtype=np.float32, - ) - x2 = flow.tensor(input_arr2, device=device) - of_lazy_out2 = linear_g(x2) - of_eager_out2 = linear_reshape(x2) - test_case2 = np.array_equal(of_lazy_out2.numpy(), of_eager_out2.numpy()) - return_dict["load2"] = test_case2 + # The following section is for testing the new input shape after completing the load. 
+ input_arr2 = np.array( + [ + [-0.94630778, -0.83378579, -0.87060891], + [2.0289922, -0.28708987, -2.18369248], + [0.08086036, -1.81075924, 1.20752494], + ], + dtype=np.float32, + ) + x2 = flow.tensor(input_arr2, device=device) + of_lazy_out2 = linear_g(x2) + of_eager_out2 = linear_reshape(x2) + test_case2 = np.array_equal(of_lazy_out2.numpy(), of_eager_out2.numpy()) + return_dict["load2"] = test_case2 def _graph_save(return_dict, filename, with_eager): state_dict = _test_linear_multi_graph_save( - return_dict, flow.device("cuda:0"), True, with_eager, + return_dict, flow.device("cuda"), True, with_eager, ) print( f"state_dict(with_eager={with_eager}) tensors size ", @@ -353,19 +352,7 @@ def _graph_load(return_dict, filename): state_dict_loaded = flow.load(filename) # load with nn.Graph _test_linear_multi_graph_load( - return_dict, flow.device("cuda"), True, state_dict_loaded, True - ) - print("====> load process done") - - -def _graph_load_to_another_device(return_dict, filename): - state_dict_loaded = flow.load(filename) - new_state_dict = flow.nn.Graph.runtime_state_dict_to( - state_dict_loaded, flow.device("cuda:1") - ) - # load with nn.Graph - _test_linear_multi_graph_load( - return_dict, flow.device("cuda:1"), True, new_state_dict, False + return_dict, flow.device("cuda"), True, state_dict_loaded, ) print("====> load process done") @@ -395,33 +382,6 @@ def _test_linear_multi_graph_save_load_gpu(test_case, with_eager): test_case.assertTrue(check_value, key + " failed.") -def _test_load_to_another_device(test_case, with_eager): - # A graph runtime state dict - with tempfile.NamedTemporaryFile() as f: - # Save a graph - manager = multiprocessing.Manager() - return_dict = manager.dict() - save_p = multiprocessing.get_context("spawn").Process( - target=_graph_save, args=(return_dict, f.name, with_eager), - ) - save_p.start() - save_p.join() - print(save_p) - - # Resume a graph from a graph runtime state dict - load_p = multiprocessing.get_context("spawn").Process( - target=_graph_load_to_another_device, args=(return_dict, f.name) - ) - load_p.start() - load_p.join() - print(load_p) - - # test_case can't be passed into sub process, so we check with return_dict. - # Reference: https://stackoverflow.com/questions/52225003/writing-to-multiple-files-using-multiprocessing-error-typeerror-cannot-seria - for (key, check_value) in return_dict.items(): - test_case.assertTrue(check_value, key + " failed.") - - @unittest.skipIf(os.getenv("ONEFLOW_TEST_CPU_ONLY"), "only test cpu cases") @flow.unittest.skip_unless_1n1d() class TestLinearMultiGraph(oneflow.unittest.TestCase): @@ -437,9 +397,6 @@ def test_linear_multi_graph_save_load_gpu_with_share(test_case): def test_linear_multi_graph_save_load_gpu_with_share_without_eager(test_case): _test_linear_multi_graph_save_load_gpu(test_case, False) - def test_load_to_another_device(test_case): - _test_load_to_another_device(test_case, False) - if __name__ == "__main__": unittest.main() diff --git a/python/oneflow/test/modules/test_square_relu.py b/python/oneflow/test/modules/test_square_relu.py deleted file mode 100644 index 799ab272316..00000000000 --- a/python/oneflow/test/modules/test_square_relu.py +++ /dev/null @@ -1,67 +0,0 @@ -""" -Copyright 2020 The OneFlow Authors. All rights reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. 
-You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -""" - -import unittest -from collections import OrderedDict - -import numpy as np -from oneflow.test_utils.test_util import GenArgList - -import oneflow as flow -import oneflow.unittest -import torch - - -class SquareReLUActivation(torch.nn.Module): - """ - Applies the relu^2 activation introduced in https://arxiv.org/abs/2109.08668v2 - """ - - def forward(self, input): - relu_applied = torch.nn.functional.relu(input) - squared = torch.square(relu_applied) - return squared - - -def _test_square_relu(test_case, device): - torch_square_relu = SquareReLUActivation() - x = np.random.randn(2, 4, 3) - torch_x = torch.tensor(x, requires_grad=True, device=torch.device(device)) - oneflow_x = flow.tensor(x, requires_grad=True, device=flow.device(device)) - torch_y = torch_square_relu(torch_x) - oneflow_y = flow._C.square_relu(oneflow_x) - test_case.assertTrue(np.allclose(torch_y.detach().cpu().numpy(), oneflow_y.numpy())) - torch_y_sum = torch_y.sum() - torch_y_sum.backward() - oneflow_y_sum = oneflow_y.sum() - oneflow_y_sum.backward() - test_case.assertTrue( - np.allclose(torch_x.grad.cpu().numpy(), oneflow_x.grad.numpy()) - ) - - -@flow.unittest.skip_unless_1n1d() -class TestModule(flow.unittest.TestCase): - def test_square_relu(test_case): - arg_dict = OrderedDict() - arg_dict["test_fun"] = [_test_square_relu] - arg_dict["device"] = ["cpu", "cuda"] - for arg in GenArgList(arg_dict): - arg[0](test_case, *arg[1:]) - - -if __name__ == "__main__": - unittest.main()
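Note on the runtime_state_dict changes in python/oneflow/nn/graph/graph.py above: gen_index_in_tuple now assumes every leaf passed to it is a Tensor (the pass-through branch for other values is gone, matching the stricter __map_io), and each tensor leaf of inputs_original/outputs_original is recorded as an "_OFTPI<n>" marker so the nested argument structure can be stored without the tensors and later re-bound to the flat tensor tuple. A minimal standalone sketch of that round trip, independent of oneflow's ArgsTree; FakeTensor and both helper names are illustrative, not oneflow API:

    def to_placeholders(struct, state, is_tensor):
        # Depth-first over tuples/lists/dicts; every tensor leaf becomes
        # "_OFTPI<n>" (oneflow tuple index), numbered in visit order so the
        # markers line up with the flattened tensor tuple.
        if is_tensor(struct):
            state["idx"] += 1
            return "_OFTPI" + str(state["idx"])
        if isinstance(struct, (list, tuple)):
            return type(struct)(to_placeholders(v, state, is_tensor) for v in struct)
        if isinstance(struct, dict):
            return {k: to_placeholders(v, state, is_tensor) for k, v in struct.items()}
        return struct

    def from_placeholders(struct, tensor_tuple):
        # Inverse walk: markers are re-bound to entries of the flat tuple.
        if isinstance(struct, str) and struct.startswith("_OFTPI"):
            return tensor_tuple[int(struct[len("_OFTPI"):])]
        if isinstance(struct, (list, tuple)):
            return type(struct)(from_placeholders(v, tensor_tuple) for v in struct)
        if isinstance(struct, dict):
            return {k: from_placeholders(v, tensor_tuple) for k, v in struct.items()}
        return struct

    class FakeTensor:  # stand-in for oneflow.Tensor in this sketch
        pass

    a, b = FakeTensor(), FakeTensor()
    original = {"x": a, "rest": [b, 42]}
    state = {"idx": -1}
    stored = to_placeholders(original, state, lambda v: isinstance(v, FakeTensor))
    assert stored == {"x": "_OFTPI0", "rest": ["_OFTPI1", 42]}
    restored = from_placeholders(stored, (a, b))
    assert restored["x"] is a and restored["rest"][0] is b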
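Note on the load_runtime_state_dict signature change: warmup_with_run now defaults to False, so callers that relied on the automatic pre-run on load must opt in explicitly. A sketch of the save/load round trip; the multi-graph tests in this patch run save and load in separate spawned processes, and the same-process round trip below is shortened for illustration only and may not be supported by every build:

    import oneflow as flow

    class LinearGraph(flow.nn.Graph):
        def __init__(self):
            super().__init__()
            self.m = flow.nn.Linear(3, 8, False)

        def build(self, x):
            return self.m(x)

    g1 = LinearGraph()
    _ = g1(flow.randn(3, 3))        # first call compiles the graph
    sd = g1.runtime_state_dict()    # capture the compiled runtime

    g2 = LinearGraph()
    # Opt back in to the pre-run warm-up now that the default is False.
    g2.load_runtime_state_dict(sd, warmup_with_run=True)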
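Note on the bit helpers removed from python/oneflow/nn/graph/util.py: _get_bits read a field of bits k..j (inclusive) out of a packed id, and _modify_bits rewrote that field in place, which is how _task_id_to/_thrd_id_to retargeted a serialized plan's task and thread ids at another CUDA device index. A self-contained round-trip check of the two helpers exactly as they appeared above; the field positions come from the removed _task_id_to, while the sample id itself is made up:

    def _get_bits(num, k, j):
        # value of bits k..j (inclusive, counting from the LSB) of num
        mask = ((1 << (j - k + 1)) - 1) << k
        return (num & mask) >> k

    def _modify_bits(num, k, j, new):
        # num with bits k..j replaced by the low bits of new
        if k > j:
            return num
        mask = ((1 << (j - k + 1)) - 1) << k
        return (num & ~mask) | ((new & ((1 << (j - k + 1)) - 1)) << k)

    # Made-up packed id: field value 2 in bits 43..48 (the check used by
    # _task_id_to), device index 7 in bits 36..43, arbitrary low bits.
    packed = (2 << 43) | (7 << 36) | 12345
    moved = _modify_bits(packed, 36, 43, 3)   # retarget to device index 3
    assert _get_bits(moved, 36, 43) == 3
    assert _get_bits(moved, 43, 48) == 2      # type field unchanged
    assert moved & ((1 << 36) - 1) == 12345   # low bits unchanged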
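Note on the removed flow.nn.SquareReLU module and its test: the same forward value is expressible with stock ops. A short sketch, assuming (per the removed docstring) that the dedicated kernel computed max(0, x) * max(0, x); backward behavior should match as well, though it is not exercised here:

    import numpy as np
    import oneflow as flow

    def square_relu(x):
        # relu(x) squared, using only ops that remain after this patch
        y = flow.relu(x)
        return y * y

    x = flow.tensor(np.array([-0.5, 0.0, 0.5], dtype=np.float32))
    print(square_relu(x))  # expect tensor([0.0000, 0.0000, 0.2500], dtype=oneflow.float32)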
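Note on the test pattern kept in test_graph_multi_graph_v2.py: a unittest TestCase cannot be pickled into a spawned subprocess, so each subprocess writes pass/fail flags into a manager dict that the parent asserts on afterwards. The skeleton of that pattern, with an illustrative worker in place of the graph save/load steps:

    import multiprocessing

    def _worker(return_dict):
        # record results instead of asserting inside the subprocess
        return_dict["case1"] = (1 + 1 == 2)

    if __name__ == "__main__":
        manager = multiprocessing.Manager()
        return_dict = manager.dict()
        p = multiprocessing.get_context("spawn").Process(
            target=_worker, args=(return_dict,)
        )
        p.start()
        p.join()
        for key, ok in return_dict.items():
            assert ok, key + " failed."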