Skip to content

Commit

Permalink
Adds ProcessHtml module
Browse files Browse the repository at this point in the history
  • Loading branch information
daveaglick committed Feb 10, 2020
1 parent e219731 commit 4f10aa4
Show file tree
Hide file tree
Showing 4 changed files with 346 additions and 42 deletions.
1 change: 1 addition & 0 deletions RELEASE.md
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
# 1.0.0-alpha.24

- Added new `ProcessHtml` module for more flexible processing of DOM nodes.
- Added new `IEnumerable<IDocument>.Flatten()` extension to flatten document trees.
- Added new `IEnumerable<IDocument>.FilterSources()` and `IEnumerable<IDocument>.FilterDestinations()` extension methods.
- Added a new `FilterDestinations` module to filter documents by destination path.
Expand Down
48 changes: 6 additions & 42 deletions src/extensions/Statiq.Html/InsertHtml.cs
Original file line number Diff line number Diff line change
Expand Up @@ -25,8 +25,6 @@ namespace Statiq.Html
/// <category>Content</category>
public class InsertHtml : ParallelModule
{
private static readonly HtmlParser HtmlParser = new HtmlParser();

private readonly string _querySelector;
private readonly Config<string> _content;
private bool _first;
Expand Down Expand Up @@ -74,46 +72,12 @@ public InsertHtml AtPosition(AdjacentPosition position = AdjacentPosition.Before
return input.Yield();
}

// Parse the HTML content
IHtmlDocument htmlDocument = await input.ParseHtmlAsync(context, HtmlParser);
if (htmlDocument == null)
{
return input.Yield();
}

// Evaluate the query selector
try
{
if (!string.IsNullOrWhiteSpace(_querySelector))
{
IElement[] elements = _first
? new[] { htmlDocument.QuerySelector(_querySelector) }
: htmlDocument.QuerySelectorAll(_querySelector).ToArray();
if (elements.Length > 0 && elements[0] != null)
{
foreach (IElement element in elements)
{
element.Insert(_position, content);
}

using (Stream contentStream = await context.GetContentStreamAsync())
{
using (StreamWriter writer = contentStream.GetWriter())
{
htmlDocument.ToHtml(writer, ProcessingInstructionFormatter.Instance);
writer.Flush();
return input.Clone(context.GetContentProvider(contentStream, MediaTypes.Html)).Yield();
}
}
}
}
return input.Yield();
}
catch (Exception ex)
{
context.LogWarning("Exception while processing HTML for {0}: {1}", input.ToSafeDisplayString(), ex.Message);
return input.Yield();
}
return await ProcessHtml.ProcessElementsAsync(
input,
context,
_querySelector,
_first,
(i, c, e, m) => e.Insert(_position, content));
}
}
}
165 changes: 165 additions & 0 deletions src/extensions/Statiq.Html/ProcessHtml.cs
Original file line number Diff line number Diff line change
@@ -0,0 +1,165 @@
using System;
using System.Collections.Generic;
using System.IO;
using System.Linq;
using System.Threading.Tasks;
using AngleSharp.Dom;
using AngleSharp.Dom.Html;
using AngleSharp.Parser.Html;
using Microsoft.Extensions.Logging;
using Statiq.Common;

namespace Statiq.Html
{
/// <summary>
/// Runs a caller-supplied delegate against the elements of each input document's
/// HTML content that match a query selector.
/// </summary>
/// <remarks>
/// <para>
/// The document content is parsed as standards-compliant HTML and, when the DOM was modified,
/// the output content is the formatted post-parsed DOM. For that reason this module should
/// only be placed after all other template processing has been performed.
/// </para>
/// </remarks>
/// <category>Content</category>
public class ProcessHtml : ParallelModule
{
    private static readonly HtmlParser HtmlParser = new HtmlParser();

    private readonly string _querySelector;
    private readonly Action<Common.IDocument, IExecutionContext, IElement, Dictionary<string, object>> _processElement;
    private bool _first;

    /// <summary>
    /// Creates the module from a query selector and a delegate that receives the document,
    /// the execution context, the matched element, and a metadata dictionary.
    /// </summary>
    /// <param name="querySelector">The query selector used to locate elements.</param>
    /// <param name="processElement">
    /// The delegate invoked for every matched element.
    /// Entries placed in the <see cref="Dictionary{TKey, TValue}"/> are added to the output document as metadata.
    /// </param>
    /// <exception cref="ArgumentNullException"><paramref name="processElement"/> is <c>null</c>.</exception>
    public ProcessHtml(
        string querySelector,
        Action<Common.IDocument, IExecutionContext, IElement, Dictionary<string, object>> processElement)
    {
        _querySelector = querySelector;
        if (processElement == null)
        {
            throw new ArgumentNullException(nameof(processElement));
        }
        _processElement = processElement;
    }

    /// <summary>
    /// Creates the module from a query selector and a delegate that receives the document,
    /// the execution context, and the matched element.
    /// </summary>
    /// <param name="querySelector">The query selector used to locate elements.</param>
    /// <param name="processElement">The delegate invoked for every matched element.</param>
    /// <exception cref="ArgumentNullException"><paramref name="processElement"/> is <c>null</c>.</exception>
    public ProcessHtml(string querySelector, Action<Common.IDocument, IExecutionContext, IElement> processElement)
        : this(querySelector, (document, executionContext, element, metadata) => processElement(document, executionContext, element))
    {
        // The wrapping lambda above is never null, so the caller's delegate is validated here.
        if (processElement == null)
        {
            throw new ArgumentNullException(nameof(processElement));
        }
    }

    /// <summary>
    /// Creates the module from a query selector and a delegate that receives the matched element
    /// and a metadata dictionary.
    /// </summary>
    /// <param name="querySelector">The query selector used to locate elements.</param>
    /// <param name="processElement">
    /// The delegate invoked for every matched element.
    /// Entries placed in the <see cref="Dictionary{TKey, TValue}"/> are added to the output document as metadata.
    /// </param>
    /// <exception cref="ArgumentNullException"><paramref name="processElement"/> is <c>null</c>.</exception>
    public ProcessHtml(string querySelector, Action<IElement, Dictionary<string, object>> processElement)
        : this(querySelector, (document, executionContext, element, metadata) => processElement(element, metadata))
    {
        // The wrapping lambda above is never null, so the caller's delegate is validated here.
        if (processElement == null)
        {
            throw new ArgumentNullException(nameof(processElement));
        }
    }

    /// <summary>
    /// Creates the module from a query selector and a delegate that receives only the matched element.
    /// </summary>
    /// <param name="querySelector">The query selector used to locate elements.</param>
    /// <param name="processElement">The delegate invoked for every matched element.</param>
    /// <exception cref="ArgumentNullException"><paramref name="processElement"/> is <c>null</c>.</exception>
    public ProcessHtml(string querySelector, Action<IElement> processElement)
        : this(querySelector, (document, executionContext, element, metadata) => processElement(element))
    {
        // The wrapping lambda above is never null, so the caller's delegate is validated here.
        if (processElement == null)
        {
            throw new ArgumentNullException(nameof(processElement));
        }
    }

    /// <summary>
    /// Controls whether only the first query result is processed (by default all results are processed).
    /// </summary>
    /// <param name="first">Pass <c>true</c> to process only the first result.</param>
    /// <returns>The current module instance.</returns>
    public ProcessHtml First(bool first = true)
    {
        _first = first;
        return this;
    }

    /// <inheritdoc />
    protected override Task<IEnumerable<Common.IDocument>> ExecuteInputAsync(Common.IDocument input, IExecutionContext context)
    {
        return ProcessElementsAsync(input, context, _querySelector, _first, _processElement);
    }

    /// <summary>
    /// Parses the input document as HTML, applies <paramref name="processElement"/> to the elements
    /// matching <paramref name="querySelector"/>, and produces the resulting document.
    /// </summary>
    /// <param name="input">The document whose content is parsed.</param>
    /// <param name="context">The current execution context.</param>
    /// <param name="querySelector">The query selector used to locate elements.</param>
    /// <param name="first">When <c>true</c>, only the first matching element is processed.</param>
    /// <param name="processElement">The delegate invoked for every matched element.</param>
    /// <returns>
    /// The unchanged input when parsing yields no document, the selector is blank, nothing matches,
    /// or an exception is caught; otherwise the input, or a clone of it carrying any collected
    /// metadata and, if the DOM changed, the re-serialized HTML content.
    /// </returns>
    internal static async Task<IEnumerable<Common.IDocument>> ProcessElementsAsync(
        Common.IDocument input,
        IExecutionContext context,
        string querySelector,
        bool first,
        Action<Common.IDocument, IExecutionContext, IElement, Dictionary<string, object>> processElement)
    {
        // Parse first; without a parsed document there is nothing to query
        IHtmlDocument htmlDocument = await input.ParseHtmlAsync(context, HtmlParser);
        if (htmlDocument == null)
        {
            return input.Yield();
        }

        try
        {
            // A blank selector cannot match anything
            if (string.IsNullOrWhiteSpace(querySelector))
            {
                return input.Yield();
            }

            // Collect either the single first match (possibly null) or every match
            IElement[] elements;
            if (first)
            {
                elements = new[] { htmlDocument.QuerySelector(querySelector) };
            }
            else
            {
                elements = htmlDocument.QuerySelectorAll(querySelector).ToArray();
            }

            if (elements.Length == 0 || elements[0] == null)
            {
                return input.Yield();
            }

            // Snapshot the DOM before processing so edits can be detected by comparison afterwards
            INode clone = htmlDocument.Clone(true);
            Dictionary<string, object> metadata = new Dictionary<string, object>();
            for (int index = 0; index < elements.Length; index++)
            {
                processElement(input, context, elements[index], metadata);
            }

            // DOM untouched: keep the original content and only attach metadata if any was produced
            if (htmlDocument.Equals(clone))
            {
                return metadata.Count != 0 ? input.Clone(metadata).Yield() : input.Yield();
            }

            // DOM was edited: serialize it into a fresh content stream
            using (Stream contentStream = await context.GetContentStreamAsync())
            using (StreamWriter writer = contentStream.GetWriter())
            {
                htmlDocument.ToHtml(writer, ProcessingInstructionFormatter.Instance);
                writer.Flush();
                IContentProvider contentProvider = context.GetContentProvider(contentStream, MediaTypes.Html);
                return metadata.Count != 0
                    ? input.Clone(metadata, contentProvider).Yield()
                    : input.Clone(contentProvider).Yield();
            }
        }
        catch (Exception ex)
        {
            context.LogWarning("Exception while processing HTML for {0}: {1}", input.ToSafeDisplayString(), ex.Message);
            return input.Yield();
        }
    }
}
}
Loading

0 comments on commit 4f10aa4

Please sign in to comment.