# BlogBuilder.py

"""
This file should sit in the directory that hosts your "www" directory. By default, it reads text files from the "toproc" directory and converts them to HTML.
"""

import glob
import os
from lxml import html

############################
# New Article HTML Builder #
############################

def generateHtml(document):
    '''
    Build a complete HTML article page from a marked-up text file.

    The input file holds basic HTML body markup plus up to three tag lines,
    each on a line of its own: "METATAGS:", "TITLE:" and "DATE:". Tag lines
    are taken out of the article body; their values fill the page head
    (description, tags, title) and the "Article date" banner. The finished
    page is written to "www/blog/<filename>.html", relative to the current
    working directory.

    Parameters:
        document: path of the input text file.

    Returns:
        A tuple (filename, filename + ".html"). The filename is the title with
        spaces replaced by hyphens, colons removed and leading/trailing
        ".", "?" and "!" characters stripped.

    Raises:
        SystemExit(1) after printing a message when no usable "TITLE:" line
        was found.
    '''
    # Read through a context manager so the file handle is always closed.
    with open(document, 'r') as source:
        content = source.read()

    # Default values, used when the matching tag line is absent.
    meta_tag = "My Blog"
    title_tag = "My Blog Default"
    date_tag = ""

    # Split the input into tag lines and article lines. The article lines are
    # collected into a new list instead of being removed from the list that is
    # being iterated: removing during iteration skips the following line, which
    # silently dropped a tag placed directly under another tag.
    # The value is everything after the tag marker, so values that contain a
    # colon themselves (a time in the date, a subtitle) are kept whole.
    article_lines = []
    for sentence in content.split('\n'):
        if "METATAGS:" in sentence:
            meta_tag = sentence.partition("METATAGS:")[2].strip().title()
        elif "TITLE:" in sentence:
            title_tag = sentence.partition("TITLE:")[2].strip().title()
        elif "DATE:" in sentence:
            date_tag = sentence.partition("DATE:")[2].strip()
        else:
            article_lines.append(sentence)

    # Derive the output file name from the title. Colons are dropped because
    # the name is also used as a relative link, where "Foo:-Bar.html" would be
    # read as a URL with the scheme "Foo". All surrounding punctuation is
    # stripped in one pass so that mixed endings such as "?!" are fully removed.
    filename = title_tag.replace(" ", "-").replace(":", "").strip(".?!")

    # Make sure the intended title tag was picked up. Exit if not.
    if title_tag == "My Blog Default" or not filename:
        print("No title was set. Exiting.")
        raise SystemExit(1)

    #
    # Tags found above are inserted into the header below (replacing the '{}', using the format() string method)
    #
    head = """<!DOCTYPE html>
<!-- This code was generated by BlogBuilder.py, which was written by Max Lee of Maxya IT. Email me to learn more. -->
<html>
  <head>
    <meta charset="utf-8">
    <meta name="viewport" content="width=device-width">
    <meta name="description" content="{}">
    <meta type='tags' content='{}'>
    <title>Your Blog Title{}</title>
    <meta name="author" content="WHatever Your Name Is">
    <link rel="stylesheet" href="css/style.css">
  </head>\n""".format(meta_tag, meta_tag, title_tag)

    #
    # Put the page together, starting with the head.
    # Notice the div with id "branding", which you can style using CSS.
    #
    body = head
    body += """    <header>
      <div class="container">
        <a href='http://www.yoursitename.com'>
        <div id="branding">
          <img src="./img/sitelogo.png" alt="Site Logo here">
          <h1>Site name here</h1>
        </div>
        </a>
        <nav>
          <ul>
            <li class="current"><a href="http://yoursitename.com/blog/index.html">Blog Home</a></li>
          </ul>
        </nav>
      </div>
    </header>"""

    #
    # Append the article itself: the date banner followed by the input document's non-tag lines.
    #
    body += "<body><div id='date'>Article date: {}</div>\n<div class='article_body'>\n".format(date_tag)
    body += "\n".join(article_lines)
    body += "</div>"

    #
    # Now we create the footer. As an example, this is where I keep my analytics JS and copyright notice.
    # You'll definitely want to remove/change that to fit your setup.
    # This also has the class "next_article_menu", which gets updated automatically as you add the next article later.
    # The footer is deliberately not passed through format(): its JavaScript contains literal braces.
    #
    footer = """
    <footer>
    <ul class="next_article_menu">
        <li><a href="http://yoursitename.com/blog/index.html">Blog Home</a></li>
    </ul>
      <p>Your company/site name, Copyright 2018</p>
      <!-- Global site tag (gtag.js) - Google Analytics -->
        <script async src="https://www.googletagmanager.com/gtag/js?id=UA-1116123414af8253143123f3422-1"></script>
        <script>
          window.dataLayer = wisndow.dataLayer || [];
          functsion gtag(){dataLayer.pussh(arguments);}
          gtasg('js', new Dsate());
          gtasg('config', 'UAs-11923452-2311');
        </script>
    </footer>
  </body>
</html>"""
    body += "\n"+footer

    with open('www/blog/{}.html'.format(filename), 'w') as f:
        f.write(body)
    return filename, filename+".html"
        
######################
# Blog Index updater #
######################

def updateIndex(output_url, title_tag):
    '''
    Add the new article to the blog index and refresh its "last updated" line.

    Assumes directory structure of "www" ---> "blog" ---> "index.html", which
    holds the index list of all blog articles as <li> items of a <ul> placed
    directly inside a <div class="container">, plus a <div id="update_string">.
    The new entry is inserted after the current last list item.

    Parameters:
        output_url: link target for the new article, as written into the index.
        title_tag:  hyphenated title of the new article; hyphens are turned
                    back into spaces for the link text.

    Returns:
        The href of the article that was last in the index before this call
        (the "previous" article).

    Raises:
        SystemExit(1) after printing a message when the index does not have the
        expected structure or the new entry could not be added. The index file
        is only rewritten once every check has passed.
    '''
    import datetime

    index_path = 'www/blog/index.html'
    item_xpath = "//div[@class='container']/ul/li"

    # Read through a context manager so the file handle is always closed.
    with open(index_path, 'r') as indexfile:
        doc = html.document_fromstring(indexfile.read())

    existing_items = doc.xpath(item_xpath)
    if not existing_items:
        # The new entry is anchored on the last existing one, so an empty list cannot be extended.
        print("No existing articles were found in the blog index. Exiting...")
        raise SystemExit(1)
    last_item = existing_items[-1]

    # Get URL for previous blog: read it from the last existing entry itself
    # and by attribute name, so it stays correct even if another entry has no
    # link or the anchor carries attributes other than href.
    previous_links = last_item.xpath("a")
    previous_blog = previous_links[0].get('href') if previous_links else None
    if previous_blog is None:
        print("The last article in the blog index has no link. Exiting...")
        raise SystemExit(1)

    update_nodes = doc.xpath("//div[@id='update_string']")
    if not update_nodes:
        print("No 'update_string' div was found in the blog index. Exiting...")
        raise SystemExit(1)

    # Add list item to the blog list
    today = datetime.date.today().isoformat()
    link_text = title_tag.replace('-', ' ')
    new_item = html.fragment_fromstring('          <li>{} :: <a href = "{}">{}</a></li>'.format(today, output_url, link_text))
    last_item.addnext(new_item)

    # Test that the addition was successful...
    if len(doc.xpath(item_xpath)) != (len(existing_items) + 1):
        print("Something went wrong when trying to add new article to the index. Exiting...")
        raise SystemExit(1)

    # Replace date string
    #
    # I'm located in U.S. Mountain time zone... You'll want to update below to fit your situation accordingly.
    # The timestamp is the machine's local time; the zone name is only a label.
    #
    now = datetime.datetime.today()
    date_string = 'Last updated on {:%A, %B %d %Y, at %I:%M %p, U.S. Mountain Time}'.format(now)
    update_nodes[0].text = date_string

    # Convert the doc to a string and write it back.
    outfile = html.tostring(doc, pretty_print=True, encoding='unicode')
    with open(index_path, 'w') as f:
        f.write(outfile)

    return previous_blog

############################
# Previous Article Updater #
############################

def updatePreviousHtml(previous_blog, output_url, title):
    '''
    Link the previous article forward to the newly created one.

    Opens "www/blog/<previous_blog>", appends a "Next: ..." item after the last
    entry of its footer list (<ul class="next_article_menu">) and writes the
    page back, enabling the automatic linking of newly created articles.

    Parameters:
        previous_blog: file name of the previous article inside "www/blog".
        output_url:    file name of the new article; appended to the site's blog URL.
        title:         hyphenated title of the new article; hyphens are shown
                       as spaces in the link text, matching the index entry.

    Returns:
        True once the previous article has been rewritten.

    Raises:
        SystemExit(1) after printing a message when the previous article has no
        footer menu entries or the link could not be added.

    TODO: Add a "previous article" feature. Right now progress is only forward; there's no option
    (other than the back button in the browser) to go to the previous article.
    '''
    article_path = "www/blog/{}".format(previous_blog)
    menu_xpath = "//ul[@class='next_article_menu']/li"

    # Open previous blog and convert it to an lxml tree structure for further manipulation.
    with open(article_path, 'r') as f:
        parsed = html.document_fromstring(f.read())

    # Extract the menu items from the footer; returns a list.
    menu_section = parsed.xpath(menu_xpath)
    if not menu_section:
        # The new link is anchored on the last existing menu item.
        print("No 'next_article_menu' entries were found in the previous article. Exiting...")
        raise SystemExit(1)

    # Create fragment for insertion. The title arrives hyphenated (it doubles
    # as the file name), so restore the spaces for display.
    link_text = title.replace('-', ' ')
    newlink = html.fragment_fromstring("        <li><a href='http://yoursitenamehere.com/blog/{}'>Next: {}</a></li>".format(output_url, link_text))

    # Insert fragment after the last existing menu item.
    menu_section[-1].addnext(newlink)

    # Check that insert was successful
    if (len(menu_section) + 1) != len(parsed.xpath(menu_xpath)):
        print("Something when wrong when linking previous article to newest article. Exiting...")
        raise SystemExit(1)

    # Convert to string before writing out
    outfile = html.tostring(parsed, pretty_print=True, encoding='unicode')

    # Write the previous article back, now carrying the link to the next article.
    with open(article_path, 'w') as f:
        f.write(outfile)
    return True

########################
# Main Program Section #
########################

inputdir = 'toproc'

# Find files to be processed. Sorted, because glob returns names in arbitrary
# order and each article is linked to the one processed before it.
to_proc = sorted(glob.glob('{}/*.txt'.format(inputdir)))

if not to_proc:
    # Nothing queued: say so instead of reporting a completed article.
    print("No input files found in '{}'. Nothing to do.".format(inputdir))
    raise SystemExit(0)

# Most of the time, this will have a single file to process, but just in case...
for document in to_proc:
    # Build the article page, register it in the index, then link the
    # previously newest article forward to it.
    title, output_url = generateHtml(document)
    previous_blog = updateIndex(output_url, title)

    updatePreviousHtml(previous_blog, output_url, title)

print("Completed processing article. Please rsync blog directory with server.")

# Now cleanup our processed input articles...
print("Cleaning up input files")
# Only the files that were actually processed are 'unlinked' (deleted);
# anything else sitting in the input directory is left alone.
for document in to_proc:
    os.unlink(document)

print("Finished cleaning.")
print("Exiting with success status.")
raise SystemExit(0)