Navigation

    The Onewheel Forum

    Onewheel Forum

    • Register
    • Login
    • Search
    • Categories
    • Recent
    • Popular
    • Users
    • Groups
    • Rules
    • Archive

    Organising the Archive

    The Archive
    archive old forum
    8
    121
    8135
    Loading More Posts
    • Oldest to Newest
    • Newest to Oldest
    • Most Votes
    Reply
    • Reply as topic
    Log in to reply
    This topic has been deleted. Only users with topic management privileges can see it.
    • Lia
      Lia GT XR Pint Plus V1 DIY last edited by Lia

      That works :D
      Made a few tweaks that seemed to take fine.

      • Changed the button text from <a> to <span> else it generates green link text because of the css which could be hard to read on the blue button. Tried to override it with some style but gave up and made it span instead :)
      • Added <span>&nbsp;</span> after the </button> so it creates a little space between itself and the voting bit.
      • Minor grammatical update to the missing post bit.

      Full modified script below:

      #!/usr/bin/perl
      
      =head1 NAME
      
      forum-archive - Put a Google-cached community.onewheel.com thread back together
      
      =cut
      
      use IO::Handle;
      use HTTP::Request;
      use LWP::UserAgent;
      use File::Copy;
      use File::Path;
      use File::Find;
      use Pod::Usage;
      use POSIX;
      
      # Shared state for the topic currently being assembled.
      #   $HEADER - processed HTML that precedes the post list (set in ingest())
      #   @POSTS  - processed posts, stored at the index given by their data-index
      #   $FOOTER - processed HTML that follows the post list (set in ingest())
      #   $COUNT  - post count scraped from the topic/post-count span by global()
      #   $WEIRD  - set by ingest(): 0 when the page carries class="nprogress-busy",
      #             1 otherwise; pages with $WEIRD==0 may replace an earlier
      #             header/footer
      # process() resets $HEADER, @POSTS, $FOOTER and $COUNT for every topic.
      my($HEADER, @POSTS, $FOOTER, $COUNT, $WEIRD);
      # Archive-wide settings.  The 'profiles', 'resources' and 'system' entries
      # are page-relative links; wget() and copy() strip the leading '/' and '.'
      # characters to obtain the directory they write to.  The keys 'url', 'path'
      # (set by header()) and 'postid' (set by ingest()) are added at run time.
      my(%META)=(
      	'base'		=> 'https://archive.owforum.co.uk/',
      	'logo'		=> 'http://archive.owforum.co.uk/Images/OWForumArchive.png',
      	'logo_ht'		=> '60',
      	'icon'		=> 'https://archive.owforum.co.uk/assets/resources/OWForumArchiveIcon.png',
      	'profiles'	=> '../../../assets/uploads/profile',
      	'resources'	=> '../../../assets/resources',
      	'system'		=> '../../../assets/uploads/system',
      );
      
      # Remote resources: 'style' and 'broken' are downloaded through wget(),
      # 'fonts' is only linked from the generated header.
      my(%RESOURCES)=(
      	'fonts'		=> 'https://cdnjs.cloudflare.com/ajax/libs/font-awesome/5.15.4/css/all.min.css',
      	'style'		=> 'https://owforum.co.uk/assets/client-darkly.css',
      	'broken'		=> 'https://icon-icons.com/downloadimage.php?id=5390&root=39/PNG/128/&file=brokenfile_5952.png',
      );
      
      # %TOPICS maps the number captured from each .html file name to the list
      # of files found for it; %SEARCH_PATH records every directory in which such
      # a file was found (used by location() to look for media files).
      my(%TOPICS)=();
      my(%SEARCH_PATH)=();
      
      =head1 SYNOPSIS
      
      forum_archive <directory1> [directory2] [directory3] [...]
      
      =head1 DESCRIPTION
      
      This script takes a set of files downloaded from the google page cache
      for the community.onewheel.com NodeBB forum and tries to put it back
      together.
      
      =head2 Content Management
      
      A small number of resources are available from the internet.  B<forum_archive>
      will download these assets, if needed, for inclusion to the archive file
      structure.  If assets have been downloaded recently (within the last day)
      then new downloads are not attempted.  This should keep B<forum_archive> from
      slamming remote resources during testing phases.
      
      =cut
      
      # Download $src into "$asset/$dst" unless a non-empty copy that was
      # modified within the last day is already present.
      #
      #   $src   - URL to fetch
      #   $asset - destination directory; leading '/' and '.' characters are
      #            stripped, so a page-relative link such as '../../../assets/x'
      #            becomes 'assets/x' below the current directory
      #   $dst   - name of the file to create inside $asset
      #
      # Failures (HTTP error, unwritable file) are reported on STDERR and
      # otherwise ignored; the return value carries no meaning.
      sub wget {
      	my($src, $asset, $dst)=@_;
      	my($target, $response, $body, $file);
      
      	$asset=~s|^[/.]+||;
      	$target="$asset/$dst";
      
      	# A non-empty file younger than one day is considered fresh.
      	return if((-s $target) && (-M $target) <= 1);
      
      	$response=LWP::UserAgent->new->request(HTTP::Request->new( 'GET' => $src ));
      	if(!$response->is_success) {
      		warn("wget: cannot fetch $src: ".$response->status_line."\n");
      		return;
      	}
      
      	# Undo any Content-Encoding but keep the payload as raw bytes, so that
      	# images and non-ASCII stylesheets are stored exactly as served.
      	$body=$response->decoded_content('charset' => 'none');
      	$body=$response->content if(!defined($body));
      
      	File::Path::make_path($asset, { 'chmod' => 0755 });
      	if(!open($file, '>', $target)) {
      		warn("wget: cannot write $target: $!\n");
      		return;
      	}
      	binmode($file);
      	print $file $body;
      	close($file) || warn("wget: error closing $target: $!\n");
      
      	return;
      }
      
      =pod
      
      B<forum_archive> will dynamically create the C<assets> and C<topic>
      directory structures as needed to store content found within the
      post.  In an effort to increase efficiency for commonly used content,
      such as avatars, actual copying of these files will not occur each time
      such content is seen, after its initial copy.
      
      =cut
      
      # Resolve a media file referenced by a page to a readable path on disk.
      #
      #   $file - path as it appears in the downloaded HTML
      #
      # Returns $file itself when it is readable, otherwise the first readable
      # "$dir/$file" for the directories recorded in %SEARCH_PATH (tried in
      # sorted order so repeated runs pick the same file).  When nothing is
      # found, the broken-file placeholder is fetched through wget() and its
      # on-disk path is returned.
      sub location {
      	my($file)=@_;
      	my($placeholder);
      
      	return($file) if(-r $file);
      
      	foreach my $dir (sort(keys(%SEARCH_PATH))) {
      		my($try)=join('/', $dir, $file);
      		return($try) if(-r $try);
      	}
      
      	&wget($RESOURCES{'broken'}, $META{'resources'}, 'broken-file.png');
      
      	# wget() stores the file with the leading '/' and '.' characters of the
      	# directory removed; return that same location, not the page-relative
      	# link, so the caller can actually read the placeholder.
      	$placeholder=join('/', $META{'resources'}, 'broken-file.png');
      	$placeholder=~s|^[/.]+||;
      
      	return($placeholder);
      }
      
      # Copy a media file into the archive tree unless a non-empty copy that
      # was modified within the last day already exists.
      #
      #   $src   - source path as referenced by the page; resolved via location()
      #   $asset - destination directory; leading '/' and '.' characters are
      #            stripped before use
      #   $dst   - file name to create inside $asset
      #
      # A failed copy is reported on STDERR; the return value carries no meaning.
      sub copy {
      	my($src, $asset, $dst)=@_;
      	my($from, $target);
      
      	$asset=~s|^[/.]+||;
      	$target="$asset/$dst";
      
      	# Skip files that were copied recently.
      	return if((-s $target) && (-M $target) <= 1);
      
      	File::Path::make_path($asset, { 'chmod' => 0755 });
      	$from=&location($src);
      	File::Copy::copy($from, $target)
      		|| warn("copy: cannot copy $from to $target: $!\n");
      
      	return;
      }
      
      =pod
      
      Avatar images are stored in a central location, shared by the entire
      archive.  This lowers the space requirements of the archive and
      improves page load times and browser cache efficiency.
      
      =cut
      
      # Store an avatar image in the shared profile directory and return the
      # link that pages should use for it.
      #
      #   $img - avatar path as referenced by the page
      #
      # The file keeps its base name (everything after the last '/').
      sub avatar {
      	my($source)=@_;
      	(my $name=$source)=~s|^.*/||;
      	my($link)=join('/', $META{'profiles'}, $name);
      
      	&copy($source, $META{'profiles'}, $name);
      
      	return($link);
      }
      
      =pod
      
      Uploaded images which are stored in the archive may be named slightly
      differently on the archive than on the original.  NodeBB has gone through
      a couple iterations about how to handle this conflict, and B<forum_archive>
      tries to handle this by using the more unique C<ALT> tag element parameter
      name.  When that doesn't work, the original name is kept.  Images are also
      grouped by post, to avoid naming conflicts between different posts.
      
      Additionally, if an image is referenced in a post, but is not contained in
      the archive, a standard broken file image is substituted.
      
      =cut
      
      # Copy an uploaded image next to the topic page and return the opening
      # part of its replacement <img> tag (the caller keeps the remaining
      # attributes and the closing bracket).
      #
      #   $src - image path as referenced by the page
      #   $alt - ALT text of the image
      #
      # The stored name is "<postid>_<alt>" when the ALT text ends in a file
      # extension, otherwise "<postid>_<base name of $src>".
      sub upload {
      	my($src, $alt)=@_;
      	my($base);
      
      	if($alt=~m/\.\w+$/) {
      		$base=$alt;
      	} else {
      		($base=$src)=~s|^.*/||;
      	}
      
      	my($name)=join('_', $META{'postid'}, $base);
      	&copy($src, $META{'path'}, $name);
      
      	return('<img src="'.$name.'" alt="'.$alt.'"');
      }
      
      =head2 Archive Display
      
      One major change in the archive from the original is the banner.  The original
      banner is replaced by one tailored to the archive, to set it apart from
      the original forum and make it clear it is a wholly different entity.
      
      =cut
      
      # Build the archive banner that replaces the original navigation bar.
      #
      #   $start - markup emitted before the banner (the opening <nav> tag)
      #   $img   - logo of the original forum; accepted but not used
      #   $end   - markup emitted after the banner (the closing </nav> tag)
      #
      # Returns $start, the banner markup and $end joined together.
      sub banner {
      	my($open, $old_logo, $close)=@_;
      	my($markup)=qq{
      		<div class="container">
      		  <div class="navbar-header">
      		    <a href="http://archive.owforum.co.uk">
      		      <img alt="The Archive homepage" src="$META{'logo'}" height="$META{'logo_ht'}">
      		    </a>
      		  </div>
      		  <div class="navbar-header pull-right">
      		    <p class="text-right" style="padding-top: 10px">
      		      This page is an archived copy of the old Onewheel Forum.
      		    </p>
      		  </div>
      		</div>
      	};
      
      	return(join('', $open, $markup, $close));
      }
      
      =pod
      
      One of the differences which makes the archive different is, unfortunately,
      that some posts are missing.  When this occurs, B<forum_archive> inserts a
      break in the timeline with a note about the message IDs which are absent.
      
      =cut
      
      # Build the timeline entry that marks a gap in the archived posts.
      #
      #   $id - a single post index or a "first-last" range; it is used both as
      #         the data-index attribute and in the visible message
      #
      # Returns the <li> markup for the gap.
      sub missing_post {
      	my $id=shift;
      	my $notice=qq{
      	  <li component="topic/necro-post" class=" necro-post timeline-event" data-index="$id">
      	    <small class="timeline-text">
                 <span>Post(s) $id are missing from the archive :(</span><br />
      		 <span>
                   Know where these posts are?  Visit
                   <a href="https://owforum.co.uk/topic/158/missing-posts">the new forum</a>
                   for how to help get them added :)
      		 </span>
               </small>
      	  </li>
      	};
      
      	return($notice);
      }
      
      =pod
      
      Although most of this script works to remove unnecessary content from archive
      posts, one little thing is added.  A button is added to help people copy
      a post's permalink to their clipboard to facilitate sharing.
      
      =cut
      
      # Return the <script> element defining shareButton(index), the handler
      # used by the buttons from share_btn().  The handler writes
      # "$META{'url'}#<index>" to the clipboard and swaps the text of the
      # element "shareTooltip<index>" for three seconds.
      sub share_fn {
      	my $script=qq{
            <script>
              function shareButton(index) {
                navigator.clipboard.writeText("$META{'url'}#"+index);
      
                var tooltip = document.getElementById("shareTooltip"+index);
                tooltip.innerHTML = "Stoke Saved To Clipboard!";
                setTimeout(function() {
                  tooltip.innerHTML = "&nbsp; Share This Post! &nbsp;";
                }, 3000);
              }
            </script>
      	};
      
      	return($script);
      }
      
      # Return the "Share This Post!" button for the post whose index is in
      # $META{'postid'}, followed by a spacer <span>.
      #
      # The click handler lives on the <button> only.  The inner <span> is just
      # the label looked up by shareButton() via its id; it carries neither an
      # href (not valid on a span) nor its own onclick, which would make every
      # click on the label run shareButton() twice once the event bubbles up
      # to the button.
      sub share_btn {
      	return(qq{
              <button onclick="shareButton($META{'postid'})" class="btn btn-sm btn-primary">
      	        <span class="tooltiptext" id="shareTooltip$META{'postid'}">&nbsp; Share This Post! &nbsp;</span>
              </button>
      	<span>&nbsp;</span>
      	});
      }
      
      =pod
      
      An interactive, HTML5 based NodeBB forum requires a lot of javascript to
      work.  Since the archive is a static copy of that data, all of the javascript
      is removed, and the archive works nearly identically on all platforms.
      
      =cut
      
      # Clean-up applied to every chunk read from a source file.  Takes no
      # arguments: it works in place on $_ (ingest() calls it as "&global;" while
      # $_ holds the current record) and records the post count in $COUNT.
      sub global {
      	# Point links to the original forum at the archive instead.
      	s|https?://community.onewheel.com/|$META{'base'}|sg;
      
      	# Drop <noscript> blocks, inline scripts and empty <script ...> elements.
      	s|<noscript>.*?</noscript>||sg;
      	s|<script>.*?</script>||sg;
      	s|<script .*?></script>||sg;
      
      	# Remove the first reply container and the first guest reply link.
      	s|\s+<div component="topic/reply/container" .*?</div>||s;
      	s|\s+<a component="topic/reply/guest" .*?</a>||m;
      
      	# Add the "timeline" class to the post list; header() and ingest() match
      	# on class="posts timeline".
      	s|class="posts"|class="posts timeline"|mg;
      	s|\n\n<hr>\n||sg;
      	# Remember the number shown in the topic/post-count span.
      	if(m|<span component="topic/post-count".*?>(\d+)</span>|m) {
      		$COUNT=$1;
      	}
      }
      
      =pod
      
      B<forum_archive> assumes that all the headers from all the source files
      are identical, and uses the first one it finds.  With that content,
      the new banner is inserted, interactive metadata and buttons are removed,
      and the new style is setup. B<forum_archive> also collects important
      information like the page path and total message count.
      
      =cut
      
      # Turn the first chunk of a downloaded page into the archive header.
      #
      #   $_[0] - page text from the start of the document; ingest() passes the
      #           record that begins with <!DOCTYPE html>, after global() has
      #           been applied to it
      #
      # Returns the rewritten header text.  Side effects: sets $META{'url'} and
      # $META{'path'} from the canonical link, downloads the stylesheet through
      # wget() and copies the site logo through copy().
      sub header {
      	local($_)=@_;
      
      	#Cleanup to a reasonable starting header only
      	# Keep everything up to and including the opening tag of the post list.
      	s/(<ul component="topic" class="posts timeline" .*?>\s+).*$/$1/s;
      
      	# Drop whatever follows the <body ...> tag on the same line.
      	s/(<body .*?>).*$/$1/m;
      
      	#Grab some important info
      	# 'url' is the full canonical link, 'path' the part after $META{'base'}.
      	if(m|<link rel="canonical" href="($META{'base'}(.*?))">|) {
      		$META{'url'}=$1;
      		$META{'path'}=$2;
      	}
      
      	#reset links
      	#strip out unneeded content
      	# Cut a trailing "/<number>?<query>" from og:url.
      	s|(<meta property="og:url" content=".*?)/\d+\?.*?">|$1">|mg;
      
      	s|\s+<meta name="msapplication-\w+" .*?>||sg;
      
      	s|\s+<link rel="icon" sizes=.*?>||sg;
      	s|\s+<link rel="prefetch" .*?>||sg;
      	s|\s+<link rel="prefetch stylesheet" .*?>||sg;
      	s|\s+<link rel="manifest" .*?>||sg;
      	s|\s+<link rel="search" .*?>||sg;
      	s|\s+<link rel="apple-touch-icon" .*?>||sg;
      	s|\s+<link rel="alternate" .*?>||sg;
      	s|\s+<link rel="next" .*?>||sg;
      	s|\s+<link rel="prev" .*?>||sg;
      
      	# Use the archive's own favicon.
      	s|(<link rel="icon" type="image/x-icon" href=").*?">|$1$META{'icon'}">|mg;
      
      	# Replace the first stylesheet link with the downloaded theme plus the
      	# icon font from %RESOURCES.
      	&wget($RESOURCES{'style'}, $META{'resources'}, 'client-darkly.css');
      	s|<link rel="stylesheet" .*?>|<link rel="stylesheet" href="$META{'resources'}/client-darkly.css">\n\t<link rel="stylesheet" href="$RESOURCES{'fonts'}">|s;
      
      	# Copy the forum logo into the archive and point the page at that copy.
      	if(m|forum-logo" src="(.*?/site-logo.png)"|m) {
      		&copy($1, $META{'system'}, 'site-logo.png');
      		s|forum-logo" src=".*?"|forum-logo" src="$META{'system'}/site-logo.png"|mg;
      	}
      
      	s|(<h1 component="post/header" .*?)>|$1 style="padding-top: 50px;">|m;
      
      	s|\s+<section class="menu-section".*?</section>||s;
      
      	#Insert new banner
      	# The contents of the fixed-top <nav> are replaced by banner()'s markup.
      	s|(<nav class="navbar navbar-default navbar-fixed-top header".*?>).*?<img alt="Onewheel Home Page" class=" forum-logo" src="(.*?)">.*?</nav>|&banner($1, $2, '</nav>')|se;
      
      	#Remove unnecessary buttons
      	s|\s+<a class="hidden-xs" target="_blank".*rss.*</a>||mg;
      	s|\s+<div title="Sort by" .*?</div>||s;
      	s|<li>[^RL]+<span>Register</span>.*?</li>||gs;
      	s|<li>[^RL]+<span>Login</span>.*?</li>||gs;
      	s|<a component="topic/reply/guest" .*?</a>\s*||s;
      	s|<ol class="breadcrumb">.*?</ol>||s;
      
      	s|<span class="hidden-xs">Loading More Posts</span> <i .*?</i>||mg;
      
      	s|class="slideout-panel" style=".*?"|class="slideout-panel"|m;
      
      	#s|<!--<base href=.*$||m;
      	# Rebuild the head/body boundary: cut the rest of the line after
      	# </style></head>, reopen the body in front of the menu <nav>, then put
      	# the share script from share_fn() just before the first </head>.
      	s|</style></head>.*$|</style>|m;
      	s|\s+(<nav id="menu")|\n</head><body>\n$1|s;
      	s|</head>|&share_fn.'</head>'|es;
      
      	return($_);
      }
      
      =pod
      
      A lot of cleanup occurs within each forum post.  Firstly, and with the
      javascript removed, all times are calculated and coded directly in UTC.
      Interactive buttons are removed, and links to content (such as user
      pages) not contained in the archive are also removed.  Other interactive
      content (e.g. online status) is removed, too.
      
      Media, such as avatars and uploaded content is collected and placed
      properly into the new archive filesystem structures.
      
      =cut
      
      # English month names, indexed by the zero-based month from gmtime().
      my(@MONTH)=(
      	'January', 'February', 'March', 'April', 'May', 'June',
      	'July', 'August', 'September', 'October', 'November', 'December',
      );
      
      # Format a timestamp given in milliseconds since the epoch as
      # "D Month YYYY, HH:MM UTC".  Fractions of a second are discarded.
      sub utctime {
      	my($millis)=@_;
      	my($min, $hr, $day, $month, $year)=(gmtime(int($millis/1000)))[1..5];
      	my($stamp)=sprintf("%d %s %d, %02d:%02d UTC",
      		$day, $MONTH[$month], 1900+$year, $hr, $min);
      
      	return($stamp);
      }
      
      # Clean up the markup of a single post for the static archive.
      #
      #   $_[0] - one record read by ingest(), containing a post
      #
      # Returns the rewritten markup.  Side effects: avatars and uploaded images
      # referenced by the post are copied into the archive through avatar() and
      # upload(); the share button is built from $META{'postid'}, which
      # ingest() sets before calling this function.
      sub post {
      	local($_)=@_;
      	my($time);
      	
      	# Write the post time out as fixed UTC text; the value that was in the
      	# title attribute is kept in a datetime attribute.
      	if(m/data-timestamp="(\d+)"/s) {
      		#$time=POSIX::strftime("%e %B %Y, %H:%M UTC", gmtime($1/1000));
      		$time=&utctime($1);
      		s|(><span class="timeago") title="(.+?)">|$1 title="$time" datetime="$2">$time|sg;
      	}
      
      	# Remove interactive elements; links to user pages and mentions become
      	# plain spans.
      	s|<span class="replies-last .*</span>||mg;
      	s|<a component="post/parent" .*?>(.*?)</a>|$1|mg;
      	s|<i component="user/status" .*?></i>||mg;
      	s|<a href=".*?/user/.*?">(.*?)</a>|<span class="btn-link">$1</span>|sg;
      	s|<a class="plugin-mentions-user .*?>(.*?)</a>|<span class="btn-link">$1</span>|mg;
      	s|<a href="[^"]+/user/.*?">\s+(<span class="avatar.*?>)\s+</a>|$1|sg;
      	# Copy avatars and uploaded images, rewriting their src attributes.
      	s|(?<= component="user/picture" src=")([^"]+)|&avatar($1)|meg;
      	s|(?<= component="avatar/picture" src=")([^"]+)|&avatar($1)|meg;
      	s|<img src="(.*?)" alt="(.*?)"(?= \s*class="\s*img-responsive)|&upload($1, $2)|meg;
      	# Reply-count links become in-page anchors (#<number>).
      	s|<a (component="post/reply-count".*? href=").*?/(\d+)[?#].*?(".*?)>|<a $1#$2$3>|mg;
      	s|\s+<i component="post/edit-indicator".*?</i>||mg;
      	s|\s+<i class="fa fa-fw fa-chevron-right".*?</i>||mg;
      	s|\s+<i class="fa fa-fw fa-chevron-down hidden".*?</i>||mg;
      	s|\s+<i class="fa fa-fw fa-spin fa-spinner hidden".*?</i>||mg;
      	s|\s+<small class="pull-right">\s+<span class="bookmarked">.*?</span>\s+</small>||sg;
      
      	# Remaining avatar variants, plus the classes applied to user pictures
      	# and user icons.
      	s|(?<= class="avatar" src=")([^"]+)|&avatar($1)|meg;
      	s|(component="user/picture" data-uid="\d+" src=")([^"]+)|$1.&avatar($2)|meg;
      	s|(<img component="user/picture")|$1 class="avatar  avatar-sm2x avatar-rounded"|mg;
      	s|(data-uid="\d+") class="user-icon"|$1 class="avatar  avatar-sm2x avatar-rounded"|mg;
      	s|(title="\w+") class="user-icon"|$1 class="avatar  avatar-xs avatar-rounded"|mg;
      	# Strip traces of the Google cache banner.
      	s|id="[^"]*google-cache-hdr"||sg;
      	s|This is Google's cache of||sg;
      
      	# Permalinks and vote links lose their anchors; post tools are removed.
      	s|<a class="permalink" href=".*?">(.*?)</a>|<span class="text-muted pull-right">$1</span>|mg;
      	s|\s+<span component="post/tools".*?</span>||sg;
      	s|<a component="post/\w+vote" .*?>\s+(.*?)</a>\s+|$1|sg;
      	s|<span class="votes">|<span class="votes text-muted">|mg;
      
      	s|\s+<span>\s+</span>||sg;
      	s|\s+<span class="visible-xs-inline-block [^>]+>\s+</span>||sg;
      	s|<small data-editor="[^"]*" .*?</small>\s+||sg;
      	s|\s+<span class="bookmarked"><i class="fa fa-bookmark-o"></i></span>||sg;
      	s|(<span class="visible-xs-inline-block[^>]+>)(\s+<span class="text-muted pull-right"><span.*?</span></span>)(.*?</span>)|$1$3</small>\n<small class="pull-right">$2|sg;
      
      	# Put the share button at the start of the first post-tools span.
      	s|<span class="post-tools">|'<span class="post-tools">'.&share_btn|es;
      
      	return($_);
      }
      
      =pod
      
      Similarly to the header, the HTML after all the posts is based on the first
      file seen and removes some of the content better suited to an interactive
      site than a static, archive site.
      
      =cut
      
      # Strip progress-indicator markup from the text that follows the posts.
      #
      #   $_[0] - trailing chunk of a downloaded page
      #
      # Returns the text with the first progress bar, the first spinner and the
      # first nprogress container removed, in that order.
      sub footer {
      	my($tail)=@_;
      	my(@noise)=(
      		qr|<div class="progress-bar"></div>|s,
      		qr|<div class="spinner" role="spinner"><div .*?</div></div>|s,
      		qr|<div id="nprogress">.*?</div></div></div>|s,
      	);
      
      	foreach my $pattern (@noise) {
      		$tail=~s/$pattern//;
      	}
      
      	return($tail);
      }
      
      =head2 Data Import Process
      
      Each downloaded F<.html> file from the forum is read and separated into
      3 sections, a header, a list of posts, and a footer.  The first file's
      header will be processed and used as the archive files header, same with
      the footer.
      
      Each post is pulled into an array.  If a post occurs in multiple downloaded
      cache files, then the last one read is kept.  Each one is processed and
      prepared for the final archive topic.
      
      =cut
      
      # Read one downloaded page and merge it into $HEADER, @POSTS and $FOOTER.
      #
      #   $source - path of the .html file to read
      #
      # The file is read in records that end at a closing </li> on the
      # indentation level used for posts, so the first record carries the page
      # head and each following record carries one post.  global() is applied to
      # every record.  A file that cannot be opened is reported on STDERR and
      # skipped.
      sub ingest {
      	my($source)=@_;
      	my($html)=IO::Handle->new;
      	my($move, $category_url, $category);
      	local($/)="\n\t\t\t\t</li>\n\t\t\t";
      
      	if(!open($html, '<', $source)) {
      		warn("ingest: cannot read $source: $!\n");
      		return;
      	}
      
      	while(<$html>) {
      		&global;
      
      		if(m/^<!DOCTYPE html>/) {
      			# Pages without class="nprogress-busy" are flagged; their header
      			# and footer are only used when nothing else has been seen yet.
      			if(m/class="nprogress-busy"/) {
      				$WEIRD=0;
      			} else {
      				$WEIRD=1;
      			}
      
      			if(!$HEADER || !$WEIRD) {
      				$HEADER=&header($_);
      			}
      			# Remove the page head so only the first post is left in $_.
      			s/^.*<ul component="topic" class="posts timeline" .*?>\s+\n//s;
      		}
      
      		if(m|</html>$|) {
      			if(!$FOOTER || !$WEIRD) {
      				$FOOTER=&footer($_);
      
      				# Move the post bar from the footer to just below the
      				# title, and show the category plus post/view counters
      				# there.
      				if($FOOTER=~s|(<div class="post-bar">.*\n<hr>\n\t\t</div>)||s) {
      					$move=$1;
      					($category)=($HEADER=~m|<meta property="article:section" content="(.*?)">|m);
      					($category_url)=($HEADER=~m|<link rel="up" href="(.*?)">|m);
      					$category=~s/&amp;/&/g;
      
      					$HEADER=~s|</h1>\n|</h1>\n$move|s;
      					$HEADER=~s|<div class="tags pull-left">.*<div class="topic-main-buttons pull-right">|<div class="topic-main-buttons pull-left"><a href="$category_url">$category</a>|s;
      
      					$HEADER=~s|class="stats hidden-xs"|class="stats text-muted"|mg;
      					$HEADER=~s|(<span component="topic/post-count" class="human-readable-number" title="\d+">\d+</span>)<br>\s+<small>Posts</small>|<i class="fa fa-fw fa-pencil" title="Posts"></i>$1|s;
      					$HEADER=~s|(<span class="human-readable-number" title="\d+">\d+</span>)<br>\s+<small>Views</small>|<i class="fa fa-fw fa-eye" title="Views"></i>$1|s;
      				}
      
      			}
      			# The record ending in </html> is the last one of interest.
      			last;
      		}
      
      		# Store the post under its data-index; a post seen again in a later
      		# file replaces the earlier copy.
      		if(m/data-index="(\d+)"/) {
      			$META{'postid'}=$1;
      			$POSTS[$META{'postid'}]=&post($_);
      		}
      	}
      	close($html);
      }
      
      =head2 Execution
      
      The script expects a directory structure of HTML files which have valid links
      to media files.  Other than that, it is pretty agnostic about the structure
      of the directory.  It will read the header to find out what the name of the
      document should be, create it, and write to it.
      
      In addition to processing archived posts, a special post is inserted for
      anything missing.  B<forum_archive> will also produce a report on F<STDOUT>
      with information on missing posts.
      
      =cut
      
      # Assemble one topic from its downloaded pages and write
      # "<path>/index.html".
      #
      #   @_ - paths of all .html files that belong to the topic
      #
      # Every run of posts missing from indices 0 .. $COUNT-1 is replaced by a
      # single missing_post() entry stored at the last index of the run.  A
      # one-line report (path, total, coverage, missing indices) is printed on
      # STDOUT.  Problems writing the output are reported on STDERR.
      sub process {
      	my($html, $index);
      	my($absent, $total)=(0, 0);
      	my(@missing)=();
      
      	$HEADER="";
      	@POSTS=();
      	$FOOTER="";
      	$COUNT=0;
      	# Forget the previous topic's location so a page without a canonical
      	# link cannot overwrite the topic written before it.
      	delete(@META{'url', 'path'});
      
      	foreach my $entry (@_) {
      		&ingest($entry);
      	}
      
      	for(my $i=0; $i<$COUNT; $i++) {
      		$total++;
      		next if(exists($POSTS[$i]));
      
      		# Extend the gap while the next index is still inside the topic
      		# and also absent.
      		my($begin)=$i;
      		$absent++;
      		while($i+1<$COUNT && !exists($POSTS[$i+1])) {
      			$i++;
      			$absent++;
      			$total++;
      		}
      
      		my($label)=($i==$begin) ? $i : "$begin-$i";
      		$POSTS[$i]=&missing_post($label);
      		push(@missing, $label);
      	}
      
      	if($total) {
      		# Test the list itself: a lone missing post 0 would otherwise be
      		# printed as 'None' because the string "0" is false.
      		printf("%s, Total: %d, Coverage: %d%%, Missing: %s\n", $META{'path'},
      			$total, (1-$absent/$total)*100,
      			@missing ? join(' ', @missing) : 'None');
      	} else {
      		printf("%s, Total: %d\n", $META{'path'}, $total);
      	}
      
      	if(!defined($META{'path'}) || $META{'path'} eq '') {
      		warn("process: no canonical topic path found in: @_\n");
      		return;
      	}
      
      	File::Path::make_path($META{'path'}, { 'chmod' => 0755 });
      	$index=join('/', $META{'path'}, 'index.html');
      	if(!open($html, '>', $index)) {
      		warn("process: cannot write $index: $!\n");
      		return;
      	}
      	print $html $HEADER;
      	# Indices inside a gap stay undefined; only real entries are written.
      	print $html grep({ defined($_) } @POSTS);
      	print $html $FOOTER;
      	close($html) || warn("process: error closing $index: $!\n");
      
      	return;
      }
      
      # Command line handling: an option starting with -h (any case, any number
      # of dashes) prints the full manual; a first argument that is not a
      # directory prints the short usage.  pod2usage() exits in both cases.
      pod2usage('-verbose' => 2, '-exitval' => 0) if($ARGV[0] =~ m/^-+h/i);
      pod2usage('-verbose' => 1, '-exitval' => 0) unless(-d $ARGV[0]);
      
      # Collect the downloaded pages, grouped by the number captured from the
      # file name.  Directories named "assets" or "topic" are not descended into.
      my($wanted)=sub {
      	$File::Find::prune=1 if(m/^(?:assets|topic)$/);
      
      	return unless(m/(\d+)\s+.*\.html$/);
      
      	push(@{$TOPICS{$1}}, $File::Find::name);
      	$SEARCH_PATH{$File::Find::dir}=1;
      };
      find($wanted, @ARGV);
      
      # Build the topics in numeric order, feeding each its files sorted by name.
      foreach my $topic (sort { $a <=> $b } keys(%TOPICS)) {
      	my(@pages)=sort(@{$TOPICS{$topic}});
      	&process(@pages);
      }
      
      
      =head1 NOTES
      
      B<forum_archive> is basically a conglomeration of regular expressions.  This
      is by no means the best way to manage and manipulate complex HTML files.
      However, given the static nature of this content and its relative complexity,
      using regular expressions requires a substantially smaller code base and
      less interpretation of the original source files.  Essentially, in this case,
      it is much easier to strip out the junk you know you don't want than
      to understand the entire document schema fully enough to make the meaningful
      changes the right way.
      
      =pod
      


      Had to adjust the CSS and start migrating assets off the old format, so if pages load funny, force a cache refresh by pressing CTRL+F5.


      Also... made a custom 404 page ;)
      Enjoy~

      1 Reply Last reply Reply Quote 0
      • First post
        Last post