diff --git a/lib/docs/filters/coldfusion/clean_html.rb b/lib/docs/filters/coldfusion/clean_html.rb new file mode 100644 index 0000000000..b03863bc31 --- /dev/null +++ b/lib/docs/filters/coldfusion/clean_html.rb @@ -0,0 +1,85 @@ +module Docs + class Coldfusion + class CleanHtmlFilter < Filter + def call + # Listing/category pages (Tags, Functions, a category, or a guide index) + # use a different layout; keep their main container as-is after cleanup. + @doc = build_root + + # Remove site chrome and interactive widgets. + css('nav', 'footer', 'script', 'noscript', '#cfbreak', '.newsletter').remove + css('.modal', '.add-example-modal-lg', '.example-modal').remove + css('.example-btn', '.copy-btn', '.issuebutton', '.issuecount').remove + css('button').remove + + # Drop the "Add An Example" / edit / fork affordances. + css('a[href*="github.com"]', '#forkme', '#foundeo').remove + css('a.label.label-danger').remove # Edit links + + # Clean up the breadcrumb: keep the engine-version labels (they convey + # ColdFusion/Lucee/BoxLang availability) but drop navigation links and + # the issue tracker widget. + if (crumb = at_css('.breadcrumb')) + crumb.css('.label-warning').remove + crumb.css('.divider').remove + crumb.css('a[rel="nofollow"]').remove + # Remove navigation breadcrumb items (CFDocs > Functions > cf45 > …) + # that are not engine-availability labels. + crumb.css('li:not(.pull-right)').each do |li| + li.remove unless li.at_css('.label-acf, .label-lucee, .label-boxlang, .label-railo') + end + end + + # Code blocks: tag them so DevDocs applies CFML syntax highlighting. + css('pre.prettyprint', 'pre').each do |node| + node.remove_attribute('class') + node['data-language'] = 'coldfusion' + end + + # Inline code: nothing special needed, but strip prettyprint hints. + css('code').each { |node| node.remove_attribute('class') } + + # Remove now-empty wrappers left behind by the source template's many + # conditional blank lines. + css('div', 'p', 'span', 'ul', 'ol').each do |node| + node.remove if node.inner_html.strip.empty? && node.element_children.empty? + end + + doc + end + + # cfdocs splits an entry's content across the `.jumbotron` header (name, + # description, syntax), the `.breadcrumb`, and the main `.container` + # (arguments, compatibility, links, examples). Merge them into one root. + # + # NOTE: between filters the document is re-parsed as an HTML *fragment* + # (there is no
), so selectors must not depend on `body >`. + def build_root + # First .jumbotron is the page header; #cfbreak is the trailing + # newsletter jumbotron, which we ignore. + header = css('.jumbotron').reject { |n| n['id'] == 'cfbreak' } + .map { |n| n.at_css('.container') || n } + .first + breadcrumb = at_css('.breadcrumb') + + # The main content container holds the reference sections. It is a + # `.container` that is not the breadcrumb and not inside a jumbotron or + # nav. Identify it by the section headings it contains. + main = css('.container').find do |node| + next false if node.matches?('.breadcrumb') + next false if node.ancestors('.jumbotron').any? || node.ancestors('nav').any? + node.at_css('h2, .param, .panel') || node.at_css('#examples') + end + + root = Nokogiri::HTML.fragment('').at_css('div') + root << header.dup if header + root << breadcrumb.dup if breadcrumb + root << main.dup if main + + # Fall back to the full document/fragment if the expected structure is + # missing (e.g. some guide pages). + root.element_children.any? ? root : (at_css('body') || doc) + end + end + end +end diff --git a/lib/docs/filters/coldfusion/entries.rb b/lib/docs/filters/coldfusion/entries.rb new file mode 100644 index 0000000000..7cdcb4445b --- /dev/null +++ b/lib/docs/filters/coldfusion/entries.rb @@ -0,0 +1,66 @@ +module Docs + class Coldfusion + class EntriesFilter < Docs::EntriesFilter + # Category/listing slugs that aggregate other entries and must not appear + # as entries themselves. Categories generally end in "-functions" or + # "-tags", but list the fixed index pages explicitly. + INDEX_SLUGS = %w(index tags functions all).freeze + + def include_default_entry? + entry_page? + end + + def get_name + if (h1 = at_css('#docname')) + h1.content.strip + elsif (h1 = at_css('h1')) + # Guide pages: use the heading text without anchor noise. + h1.content.strip + else + super + end + end + + def get_type + return 'Guides' if guide_page? + + # Use the second breadcrumb link (Tags / Functions) as the category. + crumb = css('.breadcrumb a').map { |a| a.content.strip } + if crumb.include?('Tags') + 'Tags' + elsif crumb.include?('Functions') + 'Functions' + else + 'Guides' + end + end + + private + + # A real reference entry: a tag or function page. These have a `data-doc` + # whose value matches the slug (no spaces) and usually a `#syntax` block. + def entry_page? + return false if index_slug? + return true if guide_page? + + doc_name = at_css('[data-doc]').try(:[], 'data-doc') + return false if doc_name.nil? + # Category pages have human titles with spaces (e.g. "String Functions"). + !doc_name.include?(' ') + end + + def guide_page? + # Guides have no breadcrumb but do have content; they are neither tags + # nor functions nor category indexes. + return false if index_slug? + at_css('.breadcrumb').nil? && at_css('h1') + end + + def index_slug? + s = slug.to_s.downcase + return true if INDEX_SLUGS.include?(s) + s.end_with?('-functions', '-tags') + end + end + end +end diff --git a/lib/docs/scrapers/coldfusion.rb b/lib/docs/scrapers/coldfusion.rb new file mode 100644 index 0000000000..df6a0f861b --- /dev/null +++ b/lib/docs/scrapers/coldfusion.rb @@ -0,0 +1,58 @@ +module Docs + class Coldfusion < UrlScraper + self.name = 'ColdFusion' + self.slug = 'coldfusion' + self.type = 'simple' + self.base_url = 'https://cfdocs.org/' + self.root_path = 'index.cfm' + self.links = { + home: 'https://cfdocs.org/', + code: 'https://github.com/foundeo/cfdocs' + } + + html_filters.push 'coldfusion/entries', 'coldfusion/clean_html' + + options[:root_title] = 'ColdFusion' + + # cfdocs links categories with an encoded dash (e.g. /array%2Dfunctions); + # decode and clean those so entry paths look like /array-functions. + options[:decode_and_clean_paths] = true + + # cfdocs.org renders a page for every tag/function/guide at the site root, + # e.g. /hash or /cfhtmltopdf. Category "listing" pages (such as /tags, + # /functions and /array-functions) are crawled to discover entries, but the + # Entries filter excludes them from the index. + # + # Skip site chrome, utilities, reports and other non-reference pages. + options[:skip] = %w( + 404.cfm contributors.cfm trycf.cfm ucase.cfm llms.cfm + how-to-contribute opensearch.xml robots.txt) + + options[:skip_patterns] = [ + /\Aassets\b/, + /\Areports\b/, + /\Autilities\b/, + /\Aslack\b/, + /openimage/, + /\.json\z/, + /\.png\z/, + /\.ico\z/, + /\.xml\z/, + /\.css\z/, + /\.js\z/ + ] + + options[:attribution] = <<-HTML + © 2012–present Foundeo, Inc. and the CFDocs contributors.