From 34376cc8d2a3e2b4f957932afeb99ea4b8d96c6d Mon Sep 17 00:00:00 2001 From: galargh Date: Mon, 14 Mar 2022 16:21:36 +0100 Subject: [PATCH 1/9] feat: add flags to mirrorzim.sh and extract add_website_to_ipfs.sh out of it --- mirrorzim.sh | 76 ++++++++++++++++++++-------- tools/add_website_to_ipfs.sh | 36 +++++++++++++ tools/find_original_main_page_url.sh | 32 ------------ 3 files changed, 90 insertions(+), 54 deletions(-) create mode 100755 tools/add_website_to_ipfs.sh delete mode 100755 tools/find_original_main_page_url.sh diff --git a/mirrorzim.sh b/mirrorzim.sh index a80c6da..633beac 100755 --- a/mirrorzim.sh +++ b/mirrorzim.sh @@ -11,19 +11,25 @@ usage() { echo "" echo "SYNOPSIS" echo " $0 --languagecode= --wikitype=" + echo " [--tag=]" + echo " [--edition=]" echo " [--hostingdnsdomain=]" echo " [--hostingipnshash=]" echo " [--mainpageversion=]" + echo " [--push=]" echo "" echo "OPTIONS" echo "" - echo " -l, --languagecode string - the language of the wikimedia property e.g. tr - turkish, en - english" - echo " -w, --wikitype string - the type of the wikimedia property e.g. wikipedia, wikiquote" - echo " -d, --hostingdnsdomain string - the DNS domain name the mirror will be hosted at e.g. tr.wikipedia-on-ipfs.org" - echo " -i, --hostingipnshash string - the IPNS hash the mirror will be hosted at e.g. QmVH1VzGBydSfmNG7rmdDjAeBZ71UVeEahVbNpFQtwZK8W" - echo " -v, --mainpageversion string - an override hack used on Turkish Wikipedia, it sets the main page version as there are issues with the Kiwix version id" - - exit 2 + echo " -l, --languagecode string - the language of the wikimedia property e.g. tr - turkish, en - english" + echo " -w, --wikitype string - the type of the wikimedia property e.g. wikipedia, wikiquote" + echo " -t, --tag string - the tag of the wikimedia property e.g. all, top (defaults to all)" + echo " -e, --edition string - the edition of the wikimedia property e.g. 
maxi, mini (defaults to maxi)" + echo " -c, --date string - the date of the wikimedia property e.g. latest (defaults to latest)" + echo " -d, --hostingdnsdomain string - the DNS domain name the mirror will be hosted at e.g. tr.wikipedia-on-ipfs.org" + echo " -i, --hostingipnshash string - the IPNS hash the mirror will be hosted at e.g. QmVH1VzGBydSfmNG7rmdDjAeBZ71UVeEahVbNpFQtwZK8W" + echo " -v, --mainpageversion string - an override hack used on Turkish Wikipedia, it sets the main page version as there are issues with the Kiwix version id" + echo " -p, --push boolean - push to local ipfs instance (defaults to true)" + exit 2 } @@ -38,6 +44,18 @@ case $i in WIKI_TYPE="${i#*=}" shift ;; + -t=*|--tag=*) + TAG="${i#*=}" + shift + ;; + -e=*|--edition=*) + EDITION="${i#*=}" + shift + ;; + -c=*|--date=*) + DATE="${i#*=}" + shift + ;; -d=*|--hostingdnsdomain=*) HOSTING_DNS_DOMAIN="${i#*=}" shift @@ -50,6 +68,10 @@ case $i in MAIN_PAGE_VERSION="${i#*=}" shift ;; + -p=*|--push=*) + PUSH="${i#*=}" + shift + ;; --default) DEFAULT=YES shift @@ -70,6 +92,18 @@ if [ -z ${WIKI_TYPE+x} ]; then usage fi +if [ -z ${TAG+x} ]; then + TAG="all" +fi + +if [ -z ${EDITION+x} ]; then + EDITION="maxi" +fi + +if [ -z ${DATE+x} ]; then + DATE="latest" +fi + if [ -z ${HOSTING_DNS_DOMAIN+x} ]; then HOSTING_DNS_DOMAIN="" fi @@ -82,12 +116,16 @@ if [ -z ${MAIN_PAGE_VERSION+x} ]; then MAIN_PAGE_VERSION="" fi +if [ -z ${PUSH+x} ]; then + PUSH="true" +fi + printf "\nEnsure zimdump is present...\n" PATH=$PATH:$(realpath ./bin) which zimdump &> /dev/null || (curl --progress-bar -L https://download.openzim.org/release/zim-tools/zim-tools_linux-x86_64-3.0.0.tar.gz | tar -xvz --strip-components=1 -C ./bin zim-tools_linux-x86_64-3.0.0/zimdump && chmod +x ./bin/zimdump) printf "\nDownload and verify the zim file...\n" -ZIM_FILE_SOURCE_URL="$(./tools/getzim.sh download $WIKI_TYPE $WIKI_TYPE $LANGUAGE_CODE all maxi latest | grep 'URL:' | cut -d' ' -f3)" +ZIM_FILE_SOURCE_URL="$(./tools/getzim.sh download 
$WIKI_TYPE $WIKI_TYPE $LANGUAGE_CODE $TAG $EDITION $DATE | grep 'URL:' | cut -d' ' -f3)" ZIM_FILE=$(echo $ZIM_FILE_SOURCE_URL | rev | cut -d'/' -f1 | rev) TMP_DIRECTORY="./tmp/$(echo $ZIM_FILE | cut -d'.' -f1)" @@ -116,17 +154,11 @@ node ./bin/run $TMP_DIRECTORY \ ${HOSTING_IPNS_HASH:+--hostingipnshash=$HOSTING_IPNS_HASH} \ ${MAIN_PAGE_VERSION:+--mainpageversion=$MAIN_PAGE_VERSION} -printf "\n-------------------------\n" -printf "\nIPFS_PATH=$IPFS_PATH\n" - -printf "\nAdding the processed tmp directory to IPFS\n(this part may take long time on a slow disk):\n" -CID=$(ipfs add -r --cid-version 1 --pin=false --offline -Qp $TMP_DIRECTORY) -MFS_DIR="/${ZIM_FILE}__$(date +%F_%T)" - -# pin by adding to MFS under a meaningful name -ipfs files cp /ipfs/$CID "$MFS_DIR" - -printf "\n\n-------------------------\nD O N E !\n-------------------------\n" -printf "MFS: $MFS_DIR\n" -printf "CID: $CID" -printf "\n-------------------------\n" +if [[ "$PUSH" == "true" ]]; then + ./tools/add_website_to_ipfs.sh "$ZIM_FILE" "$TMP_DIRECTORY" "-p" +else + printf "\n\n-------------------------\nD O N E !\n-------------------------\n" + printf "ZIM: $ZIM_FILE\n" + printf "TMP: $TMP_DIRECTORY" + printf "\n-------------------------\n" +fi diff --git a/tools/add_website_to_ipfs.sh b/tools/add_website_to_ipfs.sh new file mode 100755 index 0000000..3c23cd8 --- /dev/null +++ b/tools/add_website_to_ipfs.sh @@ -0,0 +1,36 @@ +#!/bin/bash + +set -euo pipefail + +usage() { + echo "USAGE:" + echo " $0 []"; + echo "" + exit 2 +} + +if [ -z "${1-}" ]; then + echo "Missing main page name (eg. Main_Page.html) " + usage +fi + +if [ -z "${2-}" ]; then + echo "Missing unpacked zim dir (eg. 
./out) " + usage +fi + +ZIM_FILE=$1 +TMP_DIRECTORY=$2 +EXTRA_FLAGS=${3:-} + +printf "\nAdding the processed tmp directory to IPFS\n(this part may take long time on a slow disk):\n" +CID=$(ipfs add -r --cid-version 1 --pin=false --offline -Q $EXTRA_FLAGS $TMP_DIRECTORY) +MFS_DIR="/${ZIM_FILE}__$(date +%F_%T)" + +# pin by adding to MFS under a meaningful name +ipfs files cp /ipfs/$CID "$MFS_DIR" + +printf "\n\n-------------------------\nD O N E !\n-------------------------\n" +printf "MFS: $MFS_DIR\n" +printf "CID: $CID" +printf "\n-------------------------\n" diff --git a/tools/find_original_main_page_url.sh b/tools/find_original_main_page_url.sh deleted file mode 100755 index fe3ea17..0000000 --- a/tools/find_original_main_page_url.sh +++ /dev/null @@ -1,32 +0,0 @@ -#!/bin/bash -# vim: set ts=2 sw=2: - -set -euo pipefail - -# Landing pages shipping with ZIM file are either truncated or Kiwix-specific. -# This script finds the URL of original version of the langing page -# mathing the timestamp of snapshot in unpacked ZIM directory - -usage() { - echo "USAGE:" - echo " $0
"; - echo "" - exit 2 -} - -if [ -z "${1-}" ]; then - echo "Missing main page name (eg. Main_Page.html) " - usage -fi - -if [ -z "${2-}" ]; then - echo "Missing unpacked zim dir (eg. ./out) " - usage -fi - -MAIN_PAGE=$1 -ZIM_ROOT=$2 - -SNAPSHOT_URL=$(grep -io 'https://[^"]*oldid=[^"]*' "$ZIM_ROOT/A/$MAIN_PAGE") - -echo $SNAPSHOT_URL From a59d9bda445cd66713200235ba22bb9ed048bd9a Mon Sep 17 00:00:00 2001 From: galargh Date: Mon, 14 Mar 2022 16:23:03 +0100 Subject: [PATCH 2/9] feat: update Dockerfile and remove ipfs from it --- Dockerfile | 54 ++++++++++++++++++-------------------- tools/docker_entrypoint.sh | 11 ++++++++ 2 files changed, 37 insertions(+), 28 deletions(-) create mode 100755 tools/docker_entrypoint.sh diff --git a/Dockerfile b/Dockerfile index a78e2aa..7490031 100644 --- a/Dockerfile +++ b/Dockerfile @@ -1,28 +1,26 @@ -FROM debian:stable - -ENV DEBIAN_FRONTEND=noninteractive - -RUN apt update -RUN apt -y install --no-install-recommends git ca-certificates curl wget apt-utils - -# install: -# - node and yarn -# - go-ipfs -RUN curl -sL https://deb.nodesource.com/setup_14.x -o nodesource_setup.sh \ - && bash nodesource_setup.sh \ - && apt -y install --no-install-recommends nodejs \ - && npm install -g yarn \ - && wget -nv https://dist.ipfs.io/go-ipfs/v0.8.0/go-ipfs_v0.8.0_linux-amd64.tar.gz \ - && tar xvfz go-ipfs_v0.8.0_linux-amd64.tar.gz \ - && mv go-ipfs/ipfs /usr/local/bin/ipfs \ - && rm -r go-ipfs && rm go-ipfs_v0.8.0_linux-amd64.tar.gz \ - && ipfs init -p server,local-discovery,flatfs,randomports --empty-repo \ - && ipfs config --json 'Experimental.ShardingEnabled' true - -# TODO: move repo init after external volume is mounted - -ENV DEBIAN_FRONTEND=dialog - -RUN mkdir /root/distributed-wikipedia-mirror -VOLUME ["/root/distributed-wikipedia-mirror"] -WORKDIR /root/distributed-wikipedia-mirror +# docker build . 
-f Dockerfile -t distributed-wikipedia-mirror +# docker run --rm -v $(pwd)/snapshots:/github/workspace/snapshots -v $(pwd)/tmp:/github/workspace/tmp distributed-wikipedia-mirror + +FROM openzim/zim-tools:3.1.0 AS openzim + +FROM node:16.14.0-buster-slim + +RUN apt update && apt upgrade && apt install -y curl wget rsync + +COPY --from=openzim /usr/local/bin/zimdump /usr/local/bin + +COPY tools/docker_entrypoint.sh /usr/local/bin + +RUN mkdir -p /github/distributed-wikipedia-mirror +RUN mkdir -p /github/distributed-wikipedia-mirror/snapshots +RUN mkdir -p /github/distributed-wikipedia-mirror/tmp +RUN mkdir -p /github/workspace + +COPY . /github/distributed-wikipedia-mirror + +RUN cd /github/distributed-wikipedia-mirror && yarn + +VOLUME [ "/github/workspace" ] + +WORKDIR /github/distributed-wikipedia-mirror +ENTRYPOINT [ "docker_entrypoint.sh" ] diff --git a/tools/docker_entrypoint.sh b/tools/docker_entrypoint.sh new file mode 100755 index 0000000..ed243bb --- /dev/null +++ b/tools/docker_entrypoint.sh @@ -0,0 +1,11 @@ +#!/bin/bash + +set -euo pipefail + +pushd /github/distributed-wikipedia-mirror +./mirrorzim.sh "$@" "--push=false" +mkdir -p /github/workspace/snapshots +mkdir -p /github/workspace/tmp +mv snapshots/* /github/workspace/snapshots +mv tmp/* /github/workspace/tmp +popd From 59fe1a2d203c6eecad72263b78c0b99cc36b438e Mon Sep 17 00:00:00 2001 From: galargh Date: Mon, 14 Mar 2022 16:24:46 +0100 Subject: [PATCH 3/9] feat: create gh action which creates a website --- action.yml | 50 ++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 50 insertions(+) create mode 100644 action.yml diff --git a/action.yml b/action.yml new file mode 100644 index 0000000..ce3f0dd --- /dev/null +++ b/action.yml @@ -0,0 +1,50 @@ +name: Build +description: Download a zim file, unpack it, convert to website +inputs: + language-code: + description: 'the language of the wikimedia property e.g. 
tr - turkish, en - english' + required: true + default: 'en' + wiki-type: + description: 'the type of the wikimedia property e.g. wikipedia, wikiquote' + required: true + default: 'wikipedia' + tag: + description: 'the tag of the wikimedia property e.g. all, top' + required: true + default: 'all' + edition: + description: 'the edition of the wikimedia property e.g. maxi, mini' + required: true + default: 'maxi' + date: + description: 'the date of the wikimedia property e.g. latest' + required: true + default: 'latest' + hosting-dns-domain: + description: 'the DNS domain name the mirror will be hosted at e.g. tr.wikipedia-on-ipfs.org' + required: false + default: '' + hosting-ipns-hash: + description: 'the IPNS hash the mirror will be hosted at e.g. QmVH1VzGBydSfmNG7rmdDjAeBZ71UVeEahVbNpFQtwZK8W' + required: false + default: '' + main-page-version: + description: 'an override hack used on Turkish Wikipedia, it sets the main page version as there are issues with the Kiwix version id' + required: false + default: '' +outputs: + time: # id of output + description: 'The time we greeted you' +runs: + using: docker + image: Dockerfile + args: + - '--languagecode=${{ inputs.language-code }}' + - '--wikitype=${{ inputs.wiki-type }}' + - '--tag=${{ inputs.tag }}' + - '--edition=${{ inputs.edition }}' + - '--date=${{ inputs.date }}' + - '--hostingdnsdomain=${{ inputs.hosting-dns-domain }}' + - '--hostingipnshash=${{ inputs.hosting-ipns-hash }}' + - '--mainpageversion=${{ inputs.main-page-version }}' From 2732f84c9f32060943e8325595d0a4b2d2963457 Mon Sep 17 00:00:00 2001 From: galargh Date: Mon, 14 Mar 2022 16:26:03 +0100 Subject: [PATCH 4/9] feat: create gh workflow which creates a website and puts it on s3 --- .github/workflows/build.yml | 72 ++++++++++++++++++++++++++++++++ tools/publish_website_from_s3.sh | 21 ++++++++++ 2 files changed, 93 insertions(+) create mode 100644 .github/workflows/build.yml create mode 100755 tools/publish_website_from_s3.sh diff --git 
a/.github/workflows/build.yml b/.github/workflows/build.yml new file mode 100644 index 0000000..28b343d --- /dev/null +++ b/.github/workflows/build.yml @@ -0,0 +1,72 @@ +name: Build + +on: + workflow_dispatch: + inputs: + language-code: + description: 'the language of the wikimedia property e.g. tr - turkish, en - english' + required: true + default: 'en' + wiki-type: + description: 'the type of the wikimedia property e.g. wikipedia, wikiquote' + required: true + default: 'wikipedia' + tag: + description: 'the tag of the wikimedia property e.g. all, top' + required: true + default: 'all' + edition: + description: 'the edition of the wikimedia property e.g. maxi, mini' + required: true + default: 'maxi' + date: + description: 'the date of the wikimedia property e.g. latest' + required: true + default: 'latest' + hosting-dns-domain: + description: 'the DNS domain name the mirror will be hosted at e.g. tr.wikipedia-on-ipfs.org' + required: false + default: '' + hosting-ipns-hash: + description: 'the IPNS hash the mirror will be hosted at e.g. 
QmVH1VzGBydSfmNG7rmdDjAeBZ71UVeEahVbNpFQtwZK8W' + required: false + default: '' + main-page-version: + description: 'an override hack used on Turkish Wikipedia, it sets the main page version as there are issues with the Kiwix version id' + required: false + default: '' + +jobs: + build: + runs-on: ubuntu-latest + env: + AWS_S3_BUCKET: wikipedia-on-ipfs + AWS_REGION: eu-central-1 + steps: + - uses: actions/checkout@v2 + - uses: ./ + with: + language-code: ${{ github.event.inputs.language-code }} + wiki-type: ${{ github.event.inputs.wiki-type }} + tag: ${{ github.event.inputs.tag }} + edition: ${{ github.event.inputs.edition }} + date: ${{ github.event.inputs.date }} + hosting-dns-domain: ${{ github.event.inputs.hosting-dns-domain }} + hosting-ipns-hash: ${{ github.event.inputs.hosting-ipns-hash }} + main-page-version: ${{ github.event.inputs.main-page-version }} + - run: | + sudo chown -R $USER tmp + cd tmp + for d in *; do + if [[ -d "${d}" ]]; then + echo "Processing ${d} ..." + tar -czf "${d}.tar.gz" "${d}" + aws s3 cp "${d}.tar.gz" "s3://${{ env.AWS_S3_BUCKET }}/website-packages/${d}.tar.gz" \ + --acl 'public-read' --metadata "Name=${d},Url=${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}" + echo "::notice name=You can now publish $d::publish_website_from_s3.sh '${d}'" + fi + done + shell: bash + env: + AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID }} + AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY }} diff --git a/tools/publish_website_from_s3.sh b/tools/publish_website_from_s3.sh new file mode 100755 index 0000000..bf04ff2 --- /dev/null +++ b/tools/publish_website_from_s3.sh @@ -0,0 +1,21 @@ +#!/bin/bash + +set -euo pipefail + +usage() { + echo "USAGE:" + echo " $0 "; + echo "" + exit 2 +} + +if [ -z "${1-}" ]; then + echo "Missing website name (eg. 
wikipedia_be_all_maxi_2022-03) " + usage +fi + +WEBSITE_NAME=$1 + +wget "https://wikipedia-on-ipfs.s3.eu-central-1.amazonaws.com/website-packages/${WEBSITE_NAME}.tar.gz" +tar -xzf "${WEBSITE_NAME}.tar.gz" +add_website_to_ipfs.sh "${WEBSITE_NAME}.zim" "${WEBSITE_NAME}" From 59a2937814511807e6c46d2217cf663304be1935 Mon Sep 17 00:00:00 2001 From: galargh Date: Mon, 14 Mar 2022 16:27:42 +0100 Subject: [PATCH 5/9] feat: automate S3 bucket + EC2 creation --- packer/provisioner.sh | 50 +++++++++++++ packer/wikipedia-on-ipfs.pkr.hcl | 54 ++++++++++++++ terraform/.gitignore | 4 ++ terraform/outputs.tf | 3 + terraform/providers.tf | 1 + terraform/resources.tf | 117 +++++++++++++++++++++++++++++++ terraform/terraform.tf | 10 +++ terraform/variables.tf | 11 +++ tools/start_ipfs.sh | 12 ++++ 9 files changed, 262 insertions(+) create mode 100644 packer/provisioner.sh create mode 100644 packer/wikipedia-on-ipfs.pkr.hcl create mode 100644 terraform/.gitignore create mode 100644 terraform/outputs.tf create mode 100644 terraform/providers.tf create mode 100644 terraform/resources.tf create mode 100644 terraform/terraform.tf create mode 100644 terraform/variables.tf create mode 100755 tools/start_ipfs.sh diff --git a/packer/provisioner.sh b/packer/provisioner.sh new file mode 100644 index 0000000..a0fa050 --- /dev/null +++ b/packer/provisioner.sh @@ -0,0 +1,50 @@ +#!/bin/bash + +set -euo pipefail + +sudo sysctl -w net.core.rmem_max=2500000 +ulimit -n 65536 + +pushd /tmp + +# Put tools on path +sudo cp tools/start_ipfs.sh /usr/local/bin/start_ipfs.sh +sudo cp tools/add_website_to_ipfs.sh /usr/local/bin/add_website_to_ipfs.sh +sudo cp tools/publish_website_from_s3.sh /usr/local/bin/publish_website_from_s3.sh + +# Install jq +wget https://github.com/stedolan/jq/releases/download/jq-1.6/jq-linux64 +sudo cp jq-linux64 /usr/local/bin/jq +sudo chmod 755 /usr/local/bin/jq + +# Install ipfs +wget https://dist.ipfs.io/go-ipfs/v0.12.0/go-ipfs_v0.12.0_linux-amd64.tar.gz +tar -xvzf 
go-ipfs_v0.12.0_linux-amd64.tar.gz +sudo go-ipfs/install.sh + +# Install unzip +sudo apt install -y unzip + +# Install awscli +wget https://awscli.amazonaws.com/awscli-exe-linux-x86_64.zip +unzip awscli-exe-linux-x86_64.zip +sudo aws/install + +# Create ipfs service +echo " +[Unit] +Description=IPFS daemon service. + +[Service] +Type=simple +User=ubuntu +ExecStart=start_ipfs.sh + +[Install] +WantedBy=multi-user.target +" | sudo tee /etc/systemd/system/ipfs.service +sudo chmod 644 /etc/systemd/system/ipfs.service +sudo systemctl daemon-reload +sudo systemctl enable ipfs + +popd diff --git a/packer/wikipedia-on-ipfs.pkr.hcl b/packer/wikipedia-on-ipfs.pkr.hcl new file mode 100644 index 0000000..d7a88ac --- /dev/null +++ b/packer/wikipedia-on-ipfs.pkr.hcl @@ -0,0 +1,54 @@ +# --> wikipedia-on-ipfs.amazon-ebs.wikipedia-on-ipfs: AMIs were created: +# eu-central-1: ami-09f8b0969385baafa + +packer { + required_plugins { + amazon = { + version = ">= 0.0.2" + source = "github.com/hashicorp/amazon" + } + } +} + +variable "aws-region" { + type = string + default = env("AWS_REGION") +} + +source "amazon-ebs" "wikipedia-on-ipfs" { + ami_name = "wikipedia-on-ipfs/timestamp/{{timestamp}}" + instance_type = "t3.micro" + region = "${var.aws-region}" + source_ami_filter { + filters = { + name = "ubuntu/images/*ubuntu-focal-20.04-amd64-server-*" + root-device-type = "ebs" + virtualization-type = "hvm" + } + most_recent = true + owners = ["099720109477"] + } + ssh_username = "ubuntu" + tags = { + OS_Version = "Ubuntu" + Release = "Latest" + Base_AMI_ID = "{{ .SourceAMI }}" + Base_AMI_Name = "{{ .SourceAMIName }}" + } +} + +build { + name = "wikipedia-on-ipfs" + sources = [ + "source.amazon-ebs.wikipedia-on-ipfs" + ] + + provisioner "file" { + source = "../tools" + destination = "/tmp/tools" + } + + provisioner "shell" { + script = "provisioner.sh" + } +} diff --git a/terraform/.gitignore b/terraform/.gitignore new file mode 100644 index 0000000..5def054 --- /dev/null +++ 
b/terraform/.gitignore @@ -0,0 +1,4 @@ +.terraform +.terraform.lock.hcl +terraform.tfstate +terraform.tfstate.backup diff --git a/terraform/outputs.tf b/terraform/outputs.tf new file mode 100644 index 0000000..87fc147 --- /dev/null +++ b/terraform/outputs.tf @@ -0,0 +1,3 @@ +output "ssh_destination" { + value = "ubuntu@${aws_instance.wikipedia-on-ipfs.public_dns}" +} diff --git a/terraform/providers.tf b/terraform/providers.tf new file mode 100644 index 0000000..b21d3b6 --- /dev/null +++ b/terraform/providers.tf @@ -0,0 +1 @@ +provider "aws" {} diff --git a/terraform/resources.tf b/terraform/resources.tf new file mode 100644 index 0000000..af2dc8d --- /dev/null +++ b/terraform/resources.tf @@ -0,0 +1,117 @@ +resource "aws_s3_bucket" "wikipedia-on-ipfs" { + bucket = "wikipedia-on-ipfs" + + tags = { + Name = "wikipedia-on-ipfs" + Url = "https://github.com/ipfs/distributed-wikipedia-mirror" + } +} + +resource "aws_iam_user" "wikipedia-on-ipfs" { + name = "wikipedia-on-ipfs" + + tags = { + Name = "wikipedia-on-ipfs" + Url = "https://github.com/ipfs/distributed-wikipedia-mirror" + } +} + +data "aws_iam_policy_document" "wikipedia-on-ipfs" { + statement { + actions = [ + "s3:GetObject", + "s3:PutObject", + "s3:DeleteObject", + "s3:PutObjectAcl", + ] + + resources = ["${aws_s3_bucket.wikipedia-on-ipfs.arn}", "${aws_s3_bucket.wikipedia-on-ipfs.arn}/*"] + effect = "Allow" + } +} + +resource "aws_iam_user_policy" "wikipedia-on-ipfs" { + name = "wikipedia-on-ipfs" + user = "${aws_iam_user.wikipedia-on-ipfs.name}" + + policy = "${data.aws_iam_policy_document.wikipedia-on-ipfs.json}" +} + +data "aws_ami" "wikipedia-on-ipfs" { + most_recent = true + + filter { + name = "name" + values = ["wikipedia-on-ipfs/*"] + } + + owners = ["${var.ami_owner_id}"] +} + +resource "aws_security_group" "wikipedia-on-ipfs" { + name = "wikipedia-on-ipfs" + + ingress { + description = "SSH Access" + from_port = 22 + to_port = 22 + protocol = "tcp" + cidr_blocks = ["0.0.0.0/0"] + ipv6_cidr_blocks = 
["::/0"] + } + + ingress { + description = "TCP Transport" + from_port = 4001 + to_port = 4001 + protocol = "tcp" + cidr_blocks = ["0.0.0.0/0"] + ipv6_cidr_blocks = ["::/0"] + } + + egress { + from_port = 0 + to_port = 0 + protocol = "-1" + cidr_blocks = ["0.0.0.0/0"] + ipv6_cidr_blocks = ["::/0"] + } + + tags = { + Name = "wikipedia-on-ipfs" + Url = "https://github.com/ipfs/distributed-wikipedia-mirror" + } +} + +resource "aws_key_pair" "wikipedia-on-ipfs" { + key_name = "wikipedia-on-ipfs" + public_key = "${var.public_key}" +} + +resource "aws_instance" "wikipedia-on-ipfs" { + ami = data.aws_ami.wikipedia-on-ipfs.id + # t3.small doesn't have enough memory + instance_type = "t3.medium" + key_name = "${aws_key_pair.wikipedia-on-ipfs.key_name}" + + tags = { + Name = "wikipedia-on-ipfs" + Url = "https://github.com/ipfs/distributed-wikipedia-mirror" + } + + root_block_device { + volume_size = 100 + volume_type = "gp3" + + tags = { + Name = "wikipedia-on-ipfs" + Url = "https://github.com/ipfs/distributed-wikipedia-mirror" + } + } + + credit_specification { + cpu_credits = "standard" + } + + security_groups = ["${aws_security_group.wikipedia-on-ipfs.name}"] +} diff --git a/terraform/terraform.tf b/terraform/terraform.tf new file mode 100644 index 0000000..8e3f7fc --- /dev/null +++ b/terraform/terraform.tf @@ -0,0 +1,10 @@ +terraform { + required_providers { + amazon = { + source = "hashicorp/aws" + version = "4.5.0" + } + } + + required_version = "~> 1.1.4" +} diff --git a/terraform/variables.tf b/terraform/variables.tf new file mode 100644 index 0000000..ad86f7b --- /dev/null +++ b/terraform/variables.tf @@ -0,0 +1,11 @@ +variable "public_key" { + description = "SSH public key." + type = string + sensitive = true +} + +variable "ami_owner_id" { + description = "AMI owner ID." 
+ type = string + default = "642361402189" +} diff --git a/tools/start_ipfs.sh b/tools/start_ipfs.sh new file mode 100755 index 0000000..1e024c9 --- /dev/null +++ b/tools/start_ipfs.sh @@ -0,0 +1,12 @@ +#!/bin/bash + +set -euo pipefail + +if ! ipfs repo stat; then + ipfs init -p server,local-discovery,flatfs,randomports --empty-repo + ipfs config --json Experimental.AcceleratedDHTClient true + ipfs config --json 'Datastore.Spec.mounts' "$(ipfs config 'Datastore.Spec.mounts' | jq -c '.[0].child.sync=false')" + ipfs config Addresses.Swarm '["/ip4/0.0.0.0/tcp/4001","/ip6/::/tcp/4001"]' --json +fi + +ipfs daemon From 6a7dd3eefcead566f7b6c36c4c6894fa41bce41d Mon Sep 17 00:00:00 2001 From: galargh Date: Mon, 14 Mar 2022 16:28:26 +0100 Subject: [PATCH 6/9] fix: main page in case it was created from exception --- src/site-transforms.ts | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/site-transforms.ts b/src/site-transforms.ts index 8dfdc2d..1f49a84 100644 --- a/src/site-transforms.ts +++ b/src/site-transforms.ts @@ -322,7 +322,7 @@ export const useKiwixLandingPage = async ( } // Fixup relative paths, if needed - const depth = (options.kiwixMainPage.match(/\//g) || []).length + const depth = (kiwixMainPageSrc.substring(wikiFolder.length + 1).match(/\//g) || []).length if (depth) { const fixRelativeLinksUp = (filePath: string, depth: number) => { const fileBytes = readFileSync(filePath) From 9da6496f6c790e0ef7ee4c3d094521a3fd3aa093 Mon Sep 17 00:00:00 2001 From: galargh Date: Mon, 14 Mar 2022 16:47:46 +0100 Subject: [PATCH 7/9] feat: document the changes --- Dockerfile | 12 ++++++++++-- README.md | 16 ++++++++++------ packer/README.md | 3 +++ terraform/README.md | 8 ++++++++ tools/add_website_to_ipfs.sh | 3 +++ tools/publish_website_from_s3.sh | 3 +++ tools/start_ipfs.sh | 3 +++ 7 files changed, 40 insertions(+), 8 deletions(-) create mode 100644 packer/README.md create mode 100644 terraform/README.md diff --git a/Dockerfile b/Dockerfile index 
7490031..ef02b85 100644 --- a/Dockerfile +++ b/Dockerfile @@ -1,5 +1,13 @@ -# docker build . -f Dockerfile -t distributed-wikipedia-mirror -# docker run --rm -v $(pwd)/snapshots:/github/workspace/snapshots -v $(pwd)/tmp:/github/workspace/tmp distributed-wikipedia-mirror +# This Dockerfile creates a self-contained image in which mirrorzim.sh can be executed +# +# You can build the image as follows (remember to use this repo as context for the build): +# docker build . -f Dockerfile -t distributed-wikipedia-mirror +# +# You can then run the container anywhere as follows +# docker run --rm -v $(pwd)/snapshots:/github/workspace/snapshots -v $(pwd)/tmp:/github/workspace/tmp distributed-wikipedia-mirror +# NOTE(s): +# - volume attached at /github/workspace/snapshots will contain downloaded zim files after the run +# - volume attached at /github/workspace/tmp will contain created website directories after the run FROM openzim/zim-tools:3.1.0 AS openzim diff --git a/README.md b/README.md index 58fe7ac..e12c1f3 100644 --- a/README.md +++ b/README.md @@ -136,7 +136,7 @@ This step won't be necessary when automatic sharding lands in go-ipfs (wip). ### Step 3: Download the latest snapshot from kiwix.org -Source of ZIM files is at https://download.kiwix.org/zim/wikipedia/ +Source of ZIM files is at https://download.kiwix.org/zim/wikipedia/ Make sure you download `_all_maxi_` snapshots, as those include images. To automate this, you can also use the `getzim.sh` script: @@ -164,8 +164,8 @@ $ zimdump dump ./snapshots/wikipedia_tr_all_maxi_2021-01.zim --dir ./tmp/wikiped > ### ℹ️ ZIM's main page > -> Each ZIM file has "main page" attribute which defines the landing page set for the ZIM archive. -> It is often different than the "main page" of upstream Wikipedia. +> Each ZIM file has "main page" attribute which defines the landing page set for the ZIM archive. +> It is often different than the "main page" of upstream Wikipedia. 
> Kiwix Main page needs to be passed in the next step, so until there is an automated way to determine "main page" of ZIM, you need to open ZIM in Kiwix reader and eyeball the name of the landing page. ### Step 5: Convert the unpacked zim directory to a website with mirror info @@ -242,7 +242,7 @@ Make sure at least two full reliable copies exist before updating DNSLink. ## mirrorzim.sh -It is possible to automate steps 3-6 via a wrapper script named `mirrorzim.sh`. +It is possible to automate steps 3-6 via a wrapper script named `mirrorzim.sh`. It will download the latest snapshot of specified language (if needed), unpack it, and add it to IPFS. To see how the script behaves try running it on one of the smallest wikis, such as `cu`: @@ -253,9 +253,9 @@ $ ./mirrorzim.sh --languagecode=cu --wikitype=wikipedia --hostingdnsdomain=cu.wi ## Docker build -A `Dockerfile` with all the software requirements is provided. +A `Dockerfile` with all the software requirements is provided. For now it is only a handy container for running the process on non-Linux -systems or if you don't want to pollute your system with all the dependencies. +systems or if you don't want to pollute your system with all the dependencies. In the future it will be end-to-end blackbox that takes ZIM and spits out CID and repo. @@ -340,3 +340,7 @@ We are working on improving deduplication between snapshots, but for now YMMV. ## Code If you would like to contribute more to this effort, look at the [issues](https://github.com/ipfs/distributed-wikipedia-mirror/issues) in this github repo. Especially check for [issues marked with the "wishlist" label](https://github.com/ipfs/distributed-wikipedia-mirror/labels/wishlist) and issues marked ["help wanted"](https://github.com/ipfs/distributed-wikipedia-mirror/labels/help%20wanted). 
+ +## GitHub Actions Workflow + +The GitHub Actions workflow that is available in this repository takes information about the wiki website that you want to mirror, downloads its' zim, unpacks it, converts it to a website and uploads it to S3 as a tar.gz package which is publicly accessible. diff --git a/packer/README.md b/packer/README.md new file mode 100644 index 0000000..6dc0ee7 --- /dev/null +++ b/packer/README.md @@ -0,0 +1,3 @@ +Packer configuration that resides here creates AMI in which: +- ipfs service is started on machine boot +- `publish_website_from_s3.sh` is available diff --git a/terraform/README.md b/terraform/README.md new file mode 100644 index 0000000..c2605e4 --- /dev/null +++ b/terraform/README.md @@ -0,0 +1,8 @@ +Terraform configuration that resides here creates: +- S3 bucket where website packages can be uploaded +- EC2 instance which runs ipfs where `publish_website_from_s3.sh` can be run to publish mirrors + +To run `terraform` here you have to export: +- `TF_VAR_public_key` - public key which will be used to give you SSH access to EC2 +- `AWS_ACCESS_KEY_ID` and `AWS_SECRET_ACCESS_KEY` - creds to AWS account that have enough permissions to create the resources +- `AWS_REGION` - the name of the region where S3 and EC2 should be created diff --git a/tools/add_website_to_ipfs.sh b/tools/add_website_to_ipfs.sh index 3c23cd8..882933e 100755 --- a/tools/add_website_to_ipfs.sh +++ b/tools/add_website_to_ipfs.sh @@ -2,6 +2,9 @@ set -euo pipefail +# This script adds website that was created at +# from to ipfs + usage() { echo "USAGE:" echo " $0 []"; diff --git a/tools/publish_website_from_s3.sh b/tools/publish_website_from_s3.sh index bf04ff2..5ad4763 100755 --- a/tools/publish_website_from_s3.sh +++ b/tools/publish_website_from_s3.sh @@ -2,6 +2,9 @@ set -euo pipefail +# This scripts downloads from s3://wikipedia-on-ipfs, +# unpacks it and adds it to ipfs + usage() { echo "USAGE:" echo " $0 "; diff --git a/tools/start_ipfs.sh b/tools/start_ipfs.sh 
index 1e024c9..fa77932 100755 --- a/tools/start_ipfs.sh +++ b/tools/start_ipfs.sh @@ -2,6 +2,9 @@ set -euo pipefail +# This script starts ipfs daemon +# If ipfs was not initialised before, this script also initialises ipfs + if ! ipfs repo stat; then ipfs init -p server,local-discovery,flatfs,randomports --empty-repo ipfs config --json Experimental.AcceleratedDHTClient true From 4e05e28f8b309811b7614bbc87d69c98fa6fdc28 Mon Sep 17 00:00:00 2001 From: galargh Date: Mon, 14 Mar 2022 17:33:04 +0100 Subject: [PATCH 8/9] feat: update AMI --- packer/wikipedia-on-ipfs.pkr.hcl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/packer/wikipedia-on-ipfs.pkr.hcl b/packer/wikipedia-on-ipfs.pkr.hcl index d7a88ac..943e3e6 100644 --- a/packer/wikipedia-on-ipfs.pkr.hcl +++ b/packer/wikipedia-on-ipfs.pkr.hcl @@ -1,5 +1,5 @@ # --> wikipedia-on-ipfs.amazon-ebs.wikipedia-on-ipfs: AMIs were created: -# eu-central-1: ami-09f8b0969385baafa +# eu-central-1: ami-02ff7a8cff61c5d41 packer { required_plugins { From ca6090323a229b2645f53211ca48fab51d901d65 Mon Sep 17 00:00:00 2001 From: galargh Date: Wed, 16 Mar 2022 11:38:32 +0100 Subject: [PATCH 9/9] simplify AWS setup --- .github/workflows/build.yml | 72 ---------------------------- .gitignore | 5 ++ Dockerfile | 48 +++++++++---------- README.md | 24 +++++----- action.yml | 50 -------------------- mirrorzim.sh | 29 +++++------- packer/README.md | 3 -- packer/provisioner.sh | 50 -------------------- packer/wikipedia-on-ipfs.pkr.hcl | 54 --------------------- terraform/.gitignore | 4 -- terraform/README.md | 58 +++++++++++++++++++---- terraform/ec2/outputs.tf | 3 ++ terraform/{ => ec2}/providers.tf | 0 terraform/{ => ec2}/resources.tf | 80 +++++++++++++------------------- terraform/{ => ec2}/terraform.tf | 3 +- terraform/ec2/variables.tf | 23 +++++++++ terraform/ecr/outputs.tf | 15 ++++++ terraform/ecr/providers.tf | 3 ++ terraform/ecr/resources.tf | 3 ++ terraform/ecr/terraform.tf | 9 ++++ terraform/outputs.tf | 3 -- 
terraform/variables.tf | 11 ----- tools/add_website_to_ipfs.sh | 39 ---------------- tools/docker_entrypoint.sh | 11 ----- tools/entrypoint.sh | 11 +++++ tools/publish_website_from_s3.sh | 24 ---------- tools/start_ipfs.sh | 15 ------ 27 files changed, 201 insertions(+), 449 deletions(-) delete mode 100644 .github/workflows/build.yml delete mode 100644 action.yml delete mode 100644 packer/README.md delete mode 100644 packer/provisioner.sh delete mode 100644 packer/wikipedia-on-ipfs.pkr.hcl delete mode 100644 terraform/.gitignore create mode 100644 terraform/ec2/outputs.tf rename terraform/{ => ec2}/providers.tf (100%) rename terraform/{ => ec2}/resources.tf (52%) rename terraform/{ => ec2}/terraform.tf (68%) create mode 100644 terraform/ec2/variables.tf create mode 100644 terraform/ecr/outputs.tf create mode 100644 terraform/ecr/providers.tf create mode 100644 terraform/ecr/resources.tf create mode 100644 terraform/ecr/terraform.tf delete mode 100644 terraform/outputs.tf delete mode 100644 terraform/variables.tf delete mode 100755 tools/add_website_to_ipfs.sh delete mode 100755 tools/docker_entrypoint.sh create mode 100755 tools/entrypoint.sh delete mode 100755 tools/publish_website_from_s3.sh delete mode 100755 tools/start_ipfs.sh diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml deleted file mode 100644 index 28b343d..0000000 --- a/.github/workflows/build.yml +++ /dev/null @@ -1,72 +0,0 @@ -name: Build - -on: - workflow_dispatch: - inputs: - language-code: - description: 'the language of the wikimedia property e.g. tr - turkish, en - english' - required: true - default: 'en' - wiki-type: - description: 'the type of the wikimedia property e.g. wikipedia, wikiquote' - required: true - default: 'wikipedia' - tag: - description: 'the tag of the wikimedia property e.g. all, top' - required: true - default: 'all' - edition: - description: 'the edition of the wikimedia property e.g. 
maxi, mini' - required: true - default: 'maxi' - date: - description: 'the date of the wikimedia property e.g. latest' - required: true - default: 'latest' - hosting-dns-domain: - description: 'the DNS domain name the mirror will be hosted at e.g. tr.wikipedia-on-ipfs.org' - required: false - default: '' - hosting-ipns-hash: - description: 'the IPNS hash the mirror will be hosted at e.g. QmVH1VzGBydSfmNG7rmdDjAeBZ71UVeEahVbNpFQtwZK8W' - required: false - default: '' - main-page-version: - description: 'an override hack used on Turkish Wikipedia, it sets the main page version as there are issues with the Kiwix version id' - required: false - default: '' - -jobs: - build: - runs-on: ubuntu-latest - env: - AWS_S3_BUCKET: wikipedia-on-ipfs - AWS_REGION: eu-central-1 - steps: - - uses: actions/checkout@v2 - - uses: ./ - with: - language-code: ${{ github.event.inputs.language-code }} - wiki-type: ${{ github.event.inputs.wiki-type }} - tag: ${{ github.event.inputs.tag }} - edition: ${{ github.event.inputs.edition }} - date: ${{ github.event.inputs.date }} - hosting-dns-domain: ${{ github.event.inputs.hosting-dns-domain }} - hosting-ipns-hash: ${{ github.event.inputs.hosting-ipns-hash }} - main-page-version: ${{ github.event.inputs.main-page-version }} - - run: | - sudo chown -R $USER tmp - cd tmp - for d in *; do - if [[ -d "${d}" ]]; then - echo "Processing ${d} ..." 
- tar -czf "${d}.tar.gz" "${d}" - aws s3 cp "${d}.tar.gz" "s3://${{ env.AWS_S3_BUCKET }}/website-packages/${d}.tar.gz" \ - --acl 'public-read' --metadata "Name=${d},Url=${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}" - echo "::notice name=You can now publish $d::publish_website_from_s3.sh '${d}'" - fi - done - shell: bash - env: - AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID }} - AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY }} diff --git a/.gitignore b/.gitignore index ddeef37..e141398 100644 --- a/.gitignore +++ b/.gitignore @@ -19,3 +19,8 @@ node_modules /kiwix-tools bin/zimdump + +*.tfstate +*.tfstate.* +*.terraform +*.terraform.* diff --git a/Dockerfile b/Dockerfile index ef02b85..92472e7 100644 --- a/Dockerfile +++ b/Dockerfile @@ -1,34 +1,34 @@ -# This Dockerfile creates a self-contained image in which mirrorzim.sh can be executed +# This Dockerfile creates a self-contained image in which mirrorzim.sh can be executed. +# It also runs ipfs daemon. # # You can build the image as follows (remember to use this repo as context for the build): -# docker build . -f Dockerfile -t distributed-wikipedia-mirror +# docker build . 
--platform=linux/amd64 -f Dockerfile -t distributed-wikipedia-mirror # -# You can then run the container anywhere as follows -# docker run --rm -v $(pwd)/snapshots:/github/workspace/snapshots -v $(pwd)/tmp:/github/workspace/tmp distributed-wikipedia-mirror -# NOTE(s): -# - volume attached at /github/workspace/snapshots will contain downloaded zim files after the run -# - volume attached at /github/workspace/tmp will contain created website directories after the run +# You can then run the container anywhere as follows: +# docker run --ulimit nofile=65536:65536 -p 4001:4001/tcp -p 4001:4001/udp distributed-wikipedia-mirror -FROM openzim/zim-tools:3.1.0 AS openzim +FROM stedolan/jq:latest AS jq +FROM openzim/zim-tools:3.1.0 AS zimdump +FROM ipfs/go-ipfs:v0.12.0 AS ipfs +FROM node:16 -FROM node:16.14.0-buster-slim +RUN apt-get update && apt-get install --no-install-recommends --assume-yes rsync moreutils -RUN apt update && apt upgrade && apt install -y curl wget rsync +COPY --from=jq /usr/local/bin/jq /usr/local/bin/ +COPY --from=zimdump /usr/local/bin/zimdump /usr/local/bin/ +COPY --from=ipfs /usr/local/bin/ipfs /usr/local/bin/ -COPY --from=openzim /usr/local/bin/zimdump /usr/local/bin +COPY assets /root/assets +COPY bin /root/bin +COPY src /root/src +COPY tools /root/tools +COPY mirrorzim.sh package.json tsconfig.json /root/ -COPY tools/docker_entrypoint.sh /usr/local/bin +RUN mkdir /root/snapshots /root/tmp +RUN cd /root && yarn -RUN mkdir -p /github/distributed-wikipedia-mirror -RUN mkdir -p /github/distributed-wikipedia-mirror/snapshots -RUN mkdir -p /github/distributed-wikipedia-mirror/tmp -RUN mkdir -p /github/workspace +EXPOSE 4001/tcp +EXPOSE 4001/udp -COPY . 
/github/distributed-wikipedia-mirror - -RUN cd /github/distributed-wikipedia-mirror && yarn - -VOLUME [ "/github/workspace" ] - -WORKDIR /github/distributed-wikipedia-mirror -ENTRYPOINT [ "docker_entrypoint.sh" ] +WORKDIR /root +ENTRYPOINT [ "tools/entrypoint.sh" ] diff --git a/README.md b/README.md index e12c1f3..621a2af 100644 --- a/README.md +++ b/README.md @@ -254,21 +254,25 @@ $ ./mirrorzim.sh --languagecode=cu --wikitype=wikipedia --hostingdnsdomain=cu.wi ## Docker build A `Dockerfile` with all the software requirements is provided. -For now it is only a handy container for running the process on non-Linux -systems or if you don't want to pollute your system with all the dependencies. -In the future it will be end-to-end blackbox that takes ZIM and spits out CID -and repo. +It is a handy container for running the process on non-Linux systems, if you don't want to pollute your system with all the dependencies or if you want to run the process in the cloud. +It is an end-to-end blackbox that takes mirrorzim.sh arguments, spits out CID and runs IPFS daemon. -To build the docker image: +To run the publicly available docker image: ```sh -docker build . -t distributed-wikipedia-mirror-build +docker run --ulimit nofile=65536:65536 -p 4001:4001/tcp -p 4001:4001/udp public.ecr.aws/c4h1q7d1/distributed-wikipedia-mirror:latest ``` -To use it as a development environment: +Alternatively, to build the docker image: ```sh -docker run -it -v $(pwd):/root/distributed-wikipedia-mirror --net=host --entrypoint bash distributed-wikipedia-mirror-build +docker build . --platform=linux/amd64 -f Dockerfile -t distributed-wikipedia-mirror +``` + +And then, to run it: + +```sh +docker run --ulimit nofile=65536:65536 -p 4001:4001/tcp -p 4001:4001/udp distributed-wikipedia-mirror ``` # How to Help @@ -340,7 +344,3 @@ We are working on improving deduplication between snapshots, but for now YMMV. 
## Code If you would like to contribute more to this effort, look at the [issues](https://github.com/ipfs/distributed-wikipedia-mirror/issues) in this github repo. Especially check for [issues marked with the "wishlist" label](https://github.com/ipfs/distributed-wikipedia-mirror/labels/wishlist) and issues marked ["help wanted"](https://github.com/ipfs/distributed-wikipedia-mirror/labels/help%20wanted). - -## GitHub Actions Workflow - -The GitHub Actions workflow that is available in this repository takes information about the wiki website that you want to mirror, downloads its' zim, unpacks it, converts it to a website and uploads it to S3 as a tar.gz package which is publicly accessible. diff --git a/action.yml b/action.yml deleted file mode 100644 index ce3f0dd..0000000 --- a/action.yml +++ /dev/null @@ -1,50 +0,0 @@ -name: Build -description: Download a zim file, unpack it, convert to website -inputs: - language-code: - description: 'the language of the wikimedia property e.g. tr - turkish, en - english' - required: true - default: 'en' - wiki-type: - description: 'the type of the wikimedia property e.g. wikipedia, wikiquote' - required: true - default: 'wikipedia' - tag: - description: 'the tag of the wikimedia property e.g. all, top' - required: true - default: 'all' - edition: - description: 'the edition of the wikimedia property e.g. maxi, mini' - required: true - default: 'maxi' - date: - description: 'the date of the wikimedia property e.g. latest' - required: true - default: 'latest' - hosting-dns-domain: - description: 'the DNS domain name the mirror will be hosted at e.g. tr.wikipedia-on-ipfs.org' - required: false - default: '' - hosting-ipns-hash: - description: 'the IPNS hash the mirror will be hosted at e.g. 
QmVH1VzGBydSfmNG7rmdDjAeBZ71UVeEahVbNpFQtwZK8W' - required: false - default: '' - main-page-version: - description: 'an override hack used on Turkish Wikipedia, it sets the main page version as there are issues with the Kiwix version id' - required: false - default: '' -outputs: - time: # id of output - description: 'The time we greeted you' -runs: - using: docker - image: Dockerfile - args: - - '--languagecode=${{ inputs.language-code }}' - - '--wikitype=${{ inputs.wiki-type }}' - - '--tag=${{ inputs.tag }}' - - '--edition=${{ inputs.edition }}' - - '--date=${{ inputs.date }}' - - '--hostingdnsdomain=${{ inputs.hosting-dns-domain }}' - - '--hostingipnshash=${{ inputs.hosting-ipns-hash }}' - - '--mainpageversion=${{ inputs.main-page-version }}' diff --git a/mirrorzim.sh b/mirrorzim.sh index 633beac..299a26e 100755 --- a/mirrorzim.sh +++ b/mirrorzim.sh @@ -16,7 +16,6 @@ usage() { echo " [--hostingdnsdomain=]" echo " [--hostingipnshash=]" echo " [--mainpageversion=]" - echo " [--push=]" echo "" echo "OPTIONS" echo "" @@ -28,7 +27,6 @@ usage() { echo " -d, --hostingdnsdomain string - the DNS domain name the mirror will be hosted at e.g. tr.wikipedia-on-ipfs.org" echo " -i, --hostingipnshash string - the IPNS hash the mirror will be hosted at e.g. 
QmVH1VzGBydSfmNG7rmdDjAeBZ71UVeEahVbNpFQtwZK8W" echo " -v, --mainpageversion string - an override hack used on Turkish Wikipedia, it sets the main page version as there are issues with the Kiwix version id" - echo " -p, --push boolean - push to local ipfs instance (defaults to true)" exit 2 } @@ -68,10 +66,6 @@ case $i in MAIN_PAGE_VERSION="${i#*=}" shift ;; - -p=*|--push=*) - PUSH="${i#*=}" - shift - ;; --default) DEFAULT=YES shift @@ -116,10 +110,6 @@ if [ -z ${MAIN_PAGE_VERSION+x} ]; then MAIN_PAGE_VERSION="" fi -if [ -z ${PUSH+x} ]; then - PUSH="true" -fi - printf "\nEnsure zimdump is present...\n" PATH=$PATH:$(realpath ./bin) which zimdump &> /dev/null || (curl --progress-bar -L https://download.openzim.org/release/zim-tools/zim-tools_linux-x86_64-3.0.0.tar.gz | tar -xvz --strip-components=1 -C ./bin zim-tools_linux-x86_64-3.0.0/zimdump && chmod +x ./bin/zimdump) @@ -154,11 +144,14 @@ node ./bin/run $TMP_DIRECTORY \ ${HOSTING_IPNS_HASH:+--hostingipnshash=$HOSTING_IPNS_HASH} \ ${MAIN_PAGE_VERSION:+--mainpageversion=$MAIN_PAGE_VERSION} -if [[ "$PUSH" == "true" ]]; then - ./tools/add_website_to_ipfs.sh "$ZIM_FILE" "$TMP_DIRECTORY" "-p" -else - printf "\n\n-------------------------\nD O N E !\n-------------------------\n" - printf "ZIM: $ZIM_FILE\n" - printf "TMP: $TMP_DIRECTORY" - printf "\n-------------------------\n" -fi +printf "\nAdding the processed tmp directory to IPFS\n(this part may take long time on a slow disk):\n" +CID=$(ipfs add -r --cid-version 1 --pin=false --offline -Q -p $TMP_DIRECTORY) +MFS_DIR="/${ZIM_FILE}__$(date +%F_%T)" + +# pin by adding to MFS under a meaningful name +ipfs files cp /ipfs/$CID "$MFS_DIR" + +printf "\n\n-------------------------\nD O N E !\n-------------------------\n" +printf "MFS: $MFS_DIR\n" +printf "CID: $CID" +printf "\n-------------------------\n" diff --git a/packer/README.md b/packer/README.md deleted file mode 100644 index 6dc0ee7..0000000 --- a/packer/README.md +++ /dev/null @@ -1,3 +0,0 @@ -Packer configuration 
that resides here creates AMI in which: -- ipfs service is started on machine boot -- `publish_website_from_s3.sh` is available diff --git a/packer/provisioner.sh b/packer/provisioner.sh deleted file mode 100644 index a0fa050..0000000 --- a/packer/provisioner.sh +++ /dev/null @@ -1,50 +0,0 @@ -#!/bin/bash - -set -euo pipefail - -sudo sysctl -w net.core.rmem_max=2500000 -ulimit -n 65536 - -pushd /tmp - -# Put tools on path -sudo cp tools/start_ipfs.sh /usr/local/bin/start_ipfs.sh -sudo cp tools/add_website_to_ipfs.sh /usr/local/bin/add_website_to_ipfs.sh -sudo cp tools/publish_website_from_s3.sh /usr/local/bin/publish_website_from_s3.sh - -# Install jq -wget https://github.com/stedolan/jq/releases/download/jq-1.6/jq-linux64 -sudo cp jq-linux64 /usr/local/bin/jq -sudo chmod 755 /usr/local/bin/jq - -# Install ipfs -wget https://dist.ipfs.io/go-ipfs/v0.12.0/go-ipfs_v0.12.0_linux-amd64.tar.gz -tar -xvzf go-ipfs_v0.12.0_linux-amd64.tar.gz -sudo go-ipfs/install.sh - -# Install unzip -sudo apt install -y unzip - -# Install awscli -wget https://awscli.amazonaws.com/awscli-exe-linux-x86_64.zip -unzip awscli-exe-linux-x86_64.zip -sudo aws/install - -# Create ipfs service -echo " -[Unit] -Description=IPFS daemon service. 
- -[Service] -Type=simple -User=ubuntu -ExecStart=start_ipfs.sh - -[Install] -WantedBy=multi-user.target -" | sudo tee /etc/systemd/system/ipfs.service -sudo chmod 644 /etc/systemd/system/ipfs.service -sudo systemctl daemon-reload -sudo systemctl enable ipfs - -popd diff --git a/packer/wikipedia-on-ipfs.pkr.hcl b/packer/wikipedia-on-ipfs.pkr.hcl deleted file mode 100644 index 943e3e6..0000000 --- a/packer/wikipedia-on-ipfs.pkr.hcl +++ /dev/null @@ -1,54 +0,0 @@ -# --> wikipedia-on-ipfs.amazon-ebs.wikipedia-on-ipfs: AMIs were created: -# eu-central-1: ami-02ff7a8cff61c5d41 - -packer { - required_plugins { - amazon = { - version = ">= 0.0.2" - source = "github.com/hashicorp/amazon" - } - } -} - -variable "aws-region" { - type = string - default = env("AWS_REGION") -} - -source "amazon-ebs" "wikipedia-on-ipfs" { - ami_name = "wikipedia-on-ipfs/timestamp/{{timestamp}}" - instance_type = "t3.micro" - region = "${var.aws-region}" - source_ami_filter { - filters = { - name = "ubuntu/images/*ubuntu-focal-20.04-amd64-server-*" - root-device-type = "ebs" - virtualization-type = "hvm" - } - most_recent = true - owners = ["099720109477"] - } - ssh_username = "ubuntu" - tags = { - OS_Version = "Ubuntu" - Release = "Latest" - Base_AMI_ID = "{{ .SourceAMI }}" - Base_AMI_Name = "{{ .SourceAMIName }}" - } -} - -build { - name = "wikipedia-on-ipfs" - sources = [ - "source.amazon-ebs.wikipedia-on-ipfs" - ] - - provisioner "file" { - source = "../tools" - destination = "/tmp/tools" - } - - provisioner "shell" { - script = "provisioner.sh" - } -} diff --git a/terraform/.gitignore b/terraform/.gitignore deleted file mode 100644 index 5def054..0000000 --- a/terraform/.gitignore +++ /dev/null @@ -1,4 +0,0 @@ -.terraform -.terraform.lock.hcl -terraform.tfstate -terraform.tfstate.backup diff --git a/terraform/README.md b/terraform/README.md index c2605e4..7b2dd04 100644 --- a/terraform/README.md +++ b/terraform/README.md @@ -1,8 +1,50 @@ -Terraform configuration that resides here creates: 
-- S3 bucket where website packages can be uploaded -- EC2 instance which runs ipfs where `publish_website_from_s3.sh` can be run to publish mirrors - -To run `terraform` here you have to export: -- `TF_VAR_public_key` - public key which will be used to give you SSH access to EC2 -- `AWS_ACCESS_KEY_ID` and `AWS_SECRET_ACCESS_KEY` - creds to AWS account that have enough permissions to create the resources -- `AWS_REGION` - the name of the region where S3 and EC2 should be created +This directory contains 2 terraform configurations: +- `ec2`: for creating an EC2 instance with IPFS ports exposed to public and docker installed +- `ecr`: for creating a public ECR repository which can be used to store `distributed-wikipedia-mirror` images + +The terraform configurations expect the following environment variables: +- `AWS_REGION` (ec2 only): the region to create the resources in +- `AWS_ACCESS_KEY_ID` and `AWS_SECRET_ACCESS_KEY`: the credentials to the account to create the resources in + +###### How to publish a new distributed wikipedia mirror on a new instance? + +1. Create a new instance. + ```bash + cd terraform/ec2 + terraform apply + ``` +1. SSH to the newly created instance. The exact command will be printed as an output of `terraform apply`. + ```bash + ssh -i ec2-user@ + ``` +1. Create a new distributed wikipedia mirror. + ```bash + docker run --name wikipedia-on-ipfs --ulimit nofile=65536:65536 -d -p 4001:4001/tcp -p 4001:4001/udp public.ecr.aws/c4h1q7d1/distributed-wikipedia-mirror:latest + ``` +1. Find the CID of the newly created distributed wikipedia mirror. It might take a while for it to become available. + ```bash + docker logs wikipedia-on-ipfs + ``` + +###### How to create a new ECR repository? + +It will print out a bunch of useful commands that should be updated in the docs. +```bash +cd terraform/ecr +terraform apply +``` + +###### How to create a new docker image? + +1. Log in to the ECR. 
+ ```bash + aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/c4h1q7d1/distributed-wikipedia-mirror + ``` +1. Build a new docker image. + ```bash + docker build . --platform=linux/amd64 -f Dockerfile -t public.ecr.aws/c4h1q7d1/distributed-wikipedia-mirror -t public.ecr.aws/c4h1q7d1/distributed-wikipedia-mirror:$(date -u +%F) -t public.ecr.aws/c4h1q7d1/distributed-wikipedia-mirror:$(date -u +%s) + ``` +1. Push the newly created docker image. + ```bash + docker push --all-tags public.ecr.aws/c4h1q7d1/distributed-wikipedia-mirror + ``` diff --git a/terraform/ec2/outputs.tf b/terraform/ec2/outputs.tf new file mode 100644 index 0000000..be3cf19 --- /dev/null +++ b/terraform/ec2/outputs.tf @@ -0,0 +1,3 @@ +output "ssh_command" { + value = "ssh -i ec2-user@${aws_instance.this.public_dns}" +} diff --git a/terraform/providers.tf b/terraform/ec2/providers.tf similarity index 100% rename from terraform/providers.tf rename to terraform/ec2/providers.tf diff --git a/terraform/resources.tf b/terraform/ec2/resources.tf similarity index 52% rename from terraform/resources.tf rename to terraform/ec2/resources.tf index af2dc8d..02200ae 100644 --- a/terraform/resources.tf +++ b/terraform/ec2/resources.tf @@ -1,54 +1,15 @@ -resource "aws_s3_bucket" "wikipedia-on-ipfs" { - bucket = "wikipedia-on-ipfs" - - tags = { - Name = "wikipedia-on-ipfs" - Url = "https://github.com/ipfs/distributed-wikipedia-mirror" - } -} - -resource "aws_iam_user" "wikipedia-on-ipfs" { - name = "wikipedia-on-ipfs" - - tags = { - Name = "wikipedia-on-ipfs" - Url = "https://github.com/ipfs/distributed-wikipedia-mirror" - } -} - -data "aws_iam_policy_document" "wikipedia-on-ipfs" { - statement { - actions = [ - "s3:GetObject", - "s3:PutObject", - "s3:DeleteObject", - "s3:PutObjectAcl", - ] - - resources = ["${aws_s3_bucket.wikipedia-on-ipfs.arn}", "${aws_s3_bucket.wikipedia-on-ipfs.arn}/*"] - effect = "Allow" - } -} - -resource "aws_iam_user_policy" 
"wikipedia-on-ipfs" { - name = "wikipedia-on-ipfs" - user = "${aws_iam_user.wikipedia-on-ipfs.name}" - - policy = "${data.aws_iam_policy_document.wikipedia-on-ipfs.json}" -} - -data "aws_ami" "wikipedia-on-ipfs" { +data "aws_ami" "this" { most_recent = true filter { name = "name" - values = ["wikipedia-on-ipfs/*"] + values = ["amzn2-ami-ecs-hvm-2.0.20220304-x86_64-ebs"] } - owners = ["${var.ami_owner_id}"] + owners = ["591542846629"] } -resource "aws_security_group" "wikipedia-on-ipfs" { +resource "aws_security_group" "this" { name = "wikipedia-on-ipfs" ingress { @@ -69,6 +30,15 @@ resource "aws_security_group" "wikipedia-on-ipfs" { ipv6_cidr_blocks = ["::/0"] } + ingress { + description = "UDP Transport" + from_port = 4001 + to_port = 4001 + protocol = "udp" + cidr_blocks = ["0.0.0.0/0"] + ipv6_cidr_blocks = ["::/0"] + } + egress { from_port = 0 to_port = 0 @@ -83,16 +53,21 @@ resource "aws_security_group" "wikipedia-on-ipfs" { } } -resource "aws_key_pair" "wikipedia-on-ipfs" { +resource "aws_key_pair" "this" { key_name = "wikipedia-on-ipfs" public_key = "${var.public_key}" + + tags = { + Name = "wikipedia-on-ipfs" + Url = "https://github.com/ipfs/distributed-wikipedia-mirror" + } } -resource "aws_instance" "wikipedia-on-ipfs" { - ami = data.aws_ami.wikipedia-on-ipfs.id +resource "aws_instance" "this" { + ami = data.aws_ami.this.id # t3.small doesn't have enough memory instance_type = "t3.medium" - key_name = "${aws_key_pair.wikipedia-on-ipfs.key_name}" + key_name = "${aws_key_pair.this.key_name}" tags = { Name = "wikipedia-on-ipfs" @@ -100,8 +75,10 @@ resource "aws_instance" "wikipedia-on-ipfs" { } root_block_device { - volume_size = 100 + volume_size = var.volume_size volume_type = "gp3" + iops = var.volume_iops + throughput = var.volume_throughput tags = { Name = "wikipedia-on-ipfs" @@ -113,5 +90,10 @@ resource "aws_instance" "wikipedia-on-ipfs" { cpu_credits = "standard" } - security_groups = ["${aws_security_group.wikipedia-on-ipfs.name}"] + security_groups = 
["${aws_security_group.this.name}"] + + user_data = join("\n", [ + "#!/bin/bash", + "sysctl -w net.core.rmem_max=2500000" + ]) } diff --git a/terraform/terraform.tf b/terraform/ec2/terraform.tf similarity index 68% rename from terraform/terraform.tf rename to terraform/ec2/terraform.tf index 8e3f7fc..4001c13 100644 --- a/terraform/terraform.tf +++ b/terraform/ec2/terraform.tf @@ -1,7 +1,6 @@ terraform { required_providers { - amazon = { - source = "hashicorp/aws" + aws = { version = "4.5.0" } } diff --git a/terraform/ec2/variables.tf b/terraform/ec2/variables.tf new file mode 100644 index 0000000..7d924de --- /dev/null +++ b/terraform/ec2/variables.tf @@ -0,0 +1,23 @@ +variable "public_key" { + description = "SSH public key." + type = string + sensitive = true +} + +variable "volume_size" { + description = "Root block device volume size." + type = number + default = 100 +} + +variable "volume_iops" { + description = "Root block device volume IOPS." + type = number + default = 3000 +} + +variable "volume_throughput" { + description = "Root block device volume throughput (MiB/s)." + type = number + default = 125 +} diff --git a/terraform/ecr/outputs.tf b/terraform/ecr/outputs.tf new file mode 100644 index 0000000..ed344a9 --- /dev/null +++ b/terraform/ecr/outputs.tf @@ -0,0 +1,15 @@ +output "docker_login_command" { + value = "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin ${aws_ecrpublic_repository.this.repository_uri}" +} + +output "docker_build_command" { + value = "docker build . 
--platform=linux/amd64 -f Dockerfile -t ${aws_ecrpublic_repository.this.repository_uri} -t ${aws_ecrpublic_repository.this.repository_uri}:$(date -u +%F) -t ${aws_ecrpublic_repository.this.repository_uri}:$(date -u +%s)" +} + +output "docker_push_command" { + value = "docker push --all-tags ${aws_ecrpublic_repository.this.repository_uri}" +} + +output "docker_run_command" { + value = "docker run --name wikipedia-on-ipfs --ulimit nofile=65536:65536 -d -p 4001:4001/tcp -p 4001:4001/udp ${aws_ecrpublic_repository.this.repository_uri}:latest " +} diff --git a/terraform/ecr/providers.tf b/terraform/ecr/providers.tf new file mode 100644 index 0000000..c125940 --- /dev/null +++ b/terraform/ecr/providers.tf @@ -0,0 +1,3 @@ +provider "aws" { + region = "us-east-1" +} diff --git a/terraform/ecr/resources.tf b/terraform/ecr/resources.tf new file mode 100644 index 0000000..58663b6 --- /dev/null +++ b/terraform/ecr/resources.tf @@ -0,0 +1,3 @@ +resource "aws_ecrpublic_repository" "this" { + repository_name = "distributed-wikipedia-mirror" +} diff --git a/terraform/ecr/terraform.tf b/terraform/ecr/terraform.tf new file mode 100644 index 0000000..4001c13 --- /dev/null +++ b/terraform/ecr/terraform.tf @@ -0,0 +1,9 @@ +terraform { + required_providers { + aws = { + version = "4.5.0" + } + } + + required_version = "~> 1.1.4" +} diff --git a/terraform/outputs.tf b/terraform/outputs.tf deleted file mode 100644 index 87fc147..0000000 --- a/terraform/outputs.tf +++ /dev/null @@ -1,3 +0,0 @@ -output "ssh_destination" { - value = "ubuntu@${aws_instance.wikipedia-on-ipfs.public_dns}" -} diff --git a/terraform/variables.tf b/terraform/variables.tf deleted file mode 100644 index ad86f7b..0000000 --- a/terraform/variables.tf +++ /dev/null @@ -1,11 +0,0 @@ -variable "public_key" { - description = "SSH public key." - type = string - sensitive = true -} - -variable "ami_owner_id" { - description = "AMI owner ID." 
- type = string - default = "642361402189" -} diff --git a/tools/add_website_to_ipfs.sh b/tools/add_website_to_ipfs.sh deleted file mode 100755 index 882933e..0000000 --- a/tools/add_website_to_ipfs.sh +++ /dev/null @@ -1,39 +0,0 @@ -#!/bin/bash - -set -euo pipefail - -# This script adds website that was created at -# from to ipfs - -usage() { - echo "USAGE:" - echo " $0 []"; - echo "" - exit 2 -} - -if [ -z "${1-}" ]; then - echo "Missing main page name (eg. Main_Page.html) " - usage -fi - -if [ -z "${2-}" ]; then - echo "Missing unpacked zim dir (eg. ./out) " - usage -fi - -ZIM_FILE=$1 -TMP_DIRECTORY=$2 -EXTRA_FLAGS=${3:-} - -printf "\nAdding the processed tmp directory to IPFS\n(this part may take long time on a slow disk):\n" -CID=$(ipfs add -r --cid-version 1 --pin=false --offline -Q $EXTRA_FLAGS $TMP_DIRECTORY) -MFS_DIR="/${ZIM_FILE}__$(date +%F_%T)" - -# pin by adding to MFS under a meaningful name -ipfs files cp /ipfs/$CID "$MFS_DIR" - -printf "\n\n-------------------------\nD O N E !\n-------------------------\n" -printf "MFS: $MFS_DIR\n" -printf "CID: $CID" -printf "\n-------------------------\n" diff --git a/tools/docker_entrypoint.sh b/tools/docker_entrypoint.sh deleted file mode 100755 index ed243bb..0000000 --- a/tools/docker_entrypoint.sh +++ /dev/null @@ -1,11 +0,0 @@ -#!/bin/bash - -set -euo pipefail - -pushd /github/distributed-wikipedia-mirror -./mirrorzim.sh "$@" "--push=false" -mkdir -p /github/workspace/snapshots -mkdir -p /github/workspace/tmp -mv snapshots/* /github/workspace/snapshots -mv tmp/* /github/workspace/tmp -popd diff --git a/tools/entrypoint.sh b/tools/entrypoint.sh new file mode 100755 index 0000000..c0cf62d --- /dev/null +++ b/tools/entrypoint.sh @@ -0,0 +1,11 @@ +#!/bin/bash + +set -eu + +ipfs init -p server,flatfs --empty-repo +ipfs config --json Experimental.AcceleratedDHTClient true +ipfs config --json 'Datastore.Spec.mounts' "$(ipfs config 'Datastore.Spec.mounts' | jq -c '.[0].child.sync=false')" + +./mirrorzim.sh "$@" | ts 
+ +ipfs daemon diff --git a/tools/publish_website_from_s3.sh b/tools/publish_website_from_s3.sh deleted file mode 100755 index 5ad4763..0000000 --- a/tools/publish_website_from_s3.sh +++ /dev/null @@ -1,24 +0,0 @@ -#!/bin/bash - -set -euo pipefail - -# This scripts downloads from s3://wikipedia-on-ipfs, -# unpacks it and adds it to ipfs - -usage() { - echo "USAGE:" - echo " $0 "; - echo "" - exit 2 -} - -if [ -z "${1-}" ]; then - echo "Missing website name (eg. wikipedia_be_all_maxi_2022-03) " - usage -fi - -WEBSITE_NAME=$1 - -wget "https://wikipedia-on-ipfs.s3.eu-central-1.amazonaws.com/website-packages/${WEBSITE_NAME}.tar.gz" -tar -xzf "${WEBSITE_NAME}.tar.gz" -add_website_to_ipfs.sh "${WEBSITE_NAME}.zim" "${WEBSITE_NAME}" diff --git a/tools/start_ipfs.sh b/tools/start_ipfs.sh deleted file mode 100755 index fa77932..0000000 --- a/tools/start_ipfs.sh +++ /dev/null @@ -1,15 +0,0 @@ -#!/bin/bash - -set -euo pipefail - -# This script starts ipfs daemon -# If ipfs was not initialised before, this script also initialises ipfs - -if ! ipfs repo stat; then - ipfs init -p server,local-discovery,flatfs,randomports --empty-repo - ipfs config --json Experimental.AcceleratedDHTClient true - ipfs config --json 'Datastore.Spec.mounts' "$(ipfs config 'Datastore.Spec.mounts' | jq -c '.[0].child.sync=false')" - ipfs config Addresses.Swarm '["/ip4/0.0.0.0/tcp/4001","/ip6/::/tcp/4001"]' --json -fi - -ipfs daemon