-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Add the ability to convert a WordPress export file into one Markdown file per blog post (published and draft). Metadata (categories, tags, author info) is stored in the YAML front matter. Images in each post are listed as attachments in the front matter and are downloaded automatically. Image URLs and internal links are rewritten to be relative (but rooted).
- Loading branch information
1 parent
1d957b1
commit 813a697
Showing
43 changed files
with
1,067 additions
and
46 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
2 changes: 1 addition & 1 deletion
2
Solutions/Stacker.Cli.Specs/Features/WordPressExportToTwitter.feature.cs
Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.
Oops, something went wrong.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
34 changes: 34 additions & 0 deletions
34
Solutions/Stacker.Cli/Cleaners/ContentItemAttachementPathCleaner.cs
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,34 @@ | ||
// <copyright file="ContentItemAttachementPathCleaner.cs" company="Endjin Limited"> | ||
// Copyright (c) Endjin Limited. All rights reserved. | ||
// </copyright> | ||
|
||
namespace Stacker.Cli.Cleaners | ||
{ | ||
using System; | ||
using System.Text.RegularExpressions; | ||
using Flurl; | ||
using Stacker.Cli.Domain.Universal; | ||
|
||
/// <summary>
/// Rewrites every attachment path of a content item so that it lives under
/// <c>/assets/images/blog</c> rather than under a WordPress uploads URL.
/// </summary>
public class ContentItemAttachementPathCleaner : IPreDownloadCleaner
{
    /// <summary>
    /// Rooted folder that every attachment path is moved under.
    /// </summary>
    private const string BlogImagePath = "/assets/images/blog";

    /// <summary>
    /// Matches the scheme, host and <c>/wp-content/uploads</c> prefix of an upload URL.
    /// </summary>
    /// <remarks>
    /// Built once and shared: constructing a <see cref="RegexOptions.Compiled"/> regex on every
    /// call is expensive. The dots in the host names are escaped so that they only match a
    /// literal '.', not an arbitrary character.
    /// </remarks>
    private static readonly Regex WordPressUploadsRoot = new Regex(
        @"(https?:\/\/(?:(?:blogs?\.endjin\.com)|(?:endjinblog\.azurewebsites\.net))\/wp-content\/uploads)",
        RegexOptions.Compiled,
        TimeSpan.FromSeconds(1));

    /// <summary>
    /// Replaces the WordPress uploads prefix in each attachment path with the blog image folder,
    /// and prefixes the blog image folder onto any path that still does not start with it.
    /// </summary>
    /// <param name="contentItem">The content item whose attachments are updated in place.</param>
    /// <returns>The same <paramref name="contentItem"/> instance.</returns>
    public ContentItem Clean(ContentItem contentItem)
    {
        foreach (var attachment in contentItem.Content.Attachments)
        {
            attachment.Path = WordPressUploadsRoot.Replace(attachment.Path, BlogImagePath);

            // Paths are not linguistic text, so compare them ordinally.
            if (!attachment.Path.StartsWith(BlogImagePath, StringComparison.OrdinalIgnoreCase))
            {
                attachment.Path = Url.Combine(BlogImagePath, attachment.Path);
            }
        }

        return contentItem;
    }
}
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,39 @@ | ||
// <copyright file="ContentItemCleaner.cs" company="Endjin Limited"> | ||
// Copyright (c) Endjin Limited. All rights reserved. | ||
// </copyright> | ||
|
||
namespace Stacker.Cli.Cleaners | ||
{ | ||
using System; | ||
using System.Linq; | ||
using Microsoft.Extensions.DependencyInjection; | ||
using Stacker.Cli.Domain.Universal; | ||
|
||
/// <summary>
/// Applies the cleaners registered in the service provider to a piece of content.
/// Each method resolves one cleaner interface and feeds the output of one cleaner
/// into the next, in the order the service provider returns them.
/// </summary>
public class ContentItemCleaner
{
    private readonly IServiceProvider serviceProvider;

    /// <summary>
    /// Initializes a new instance of the <see cref="ContentItemCleaner"/> class.
    /// </summary>
    /// <param name="serviceProvider">Provider used to resolve the registered cleaners.</param>
    public ContentItemCleaner(IServiceProvider serviceProvider)
    {
        this.serviceProvider = serviceProvider;
    }

    /// <summary>
    /// Runs every registered <see cref="IPreDownloadCleaner"/> over the content item.
    /// </summary>
    /// <param name="content">The content item to clean.</param>
    /// <returns>The result returned by the last cleaner, or <paramref name="content"/> when none are registered.</returns>
    public ContentItem PreDownload(ContentItem content)
    {
        ContentItem cleaned = content;

        foreach (IPreDownloadCleaner step in this.serviceProvider.GetServices<IPreDownloadCleaner>())
        {
            cleaned = step.Clean(cleaned);
        }

        return cleaned;
    }

    /// <summary>
    /// Runs every registered <see cref="IPostDownloadCleaner"/> over the content item.
    /// </summary>
    /// <param name="content">The content item to clean.</param>
    /// <returns>The result returned by the last cleaner, or <paramref name="content"/> when none are registered.</returns>
    public ContentItem PostDownload(ContentItem content)
    {
        ContentItem cleaned = content;

        foreach (IPostDownloadCleaner step in this.serviceProvider.GetServices<IPostDownloadCleaner>())
        {
            cleaned = step.Clean(cleaned);
        }

        return cleaned;
    }

    /// <summary>
    /// Runs every registered <see cref="IPostConvertCleaner"/> over the text.
    /// </summary>
    /// <param name="content">The text to clean.</param>
    /// <returns>The result returned by the last cleaner, or <paramref name="content"/> when none are registered.</returns>
    internal string PostConvert(string content)
    {
        string cleaned = content;

        foreach (IPostConvertCleaner step in this.serviceProvider.GetServices<IPostConvertCleaner>())
        {
            cleaned = step.Clean(cleaned);
        }

        return cleaned;
    }
}
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,24 @@ | ||
// <copyright file="EnsureEndjinHttpsInBody.cs" company="Endjin Limited"> | ||
// Copyright (c) Endjin Limited. All rights reserved. | ||
// </copyright> | ||
|
||
namespace Stacker.Cli.Cleaners | ||
{ | ||
using System; | ||
using System.Text.RegularExpressions; | ||
using Stacker.Cli.Domain.Universal; | ||
|
||
/// <summary>
/// Upgrades <c>http://endjin.com</c> links in the body of a content item to <c>https://endjin.com</c>.
/// </summary>
public class EnsureEndjinHttpsInBody : IPreDownloadCleaner
{
    /// <summary>
    /// Matches the insecure form of the endjin.com origin.
    /// </summary>
    /// <remarks>
    /// Built once and shared instead of being recompiled on every call. The dot is escaped so
    /// that only a literal '.' matches (the unescaped form also matched e.g. <c>http://endjinXcom</c>).
    /// </remarks>
    private static readonly Regex InsecureEndjinOrigin = new Regex(
        @"(http:\/\/endjin\.com)",
        RegexOptions.Compiled,
        TimeSpan.FromSeconds(1));

    /// <summary>
    /// Replaces every occurrence of <c>http://endjin.com</c> in the body with <c>https://endjin.com</c>.
    /// </summary>
    /// <param name="contentItem">The content item whose body is updated in place.</param>
    /// <returns>The same <paramref name="contentItem"/> instance.</returns>
    public ContentItem Clean(ContentItem contentItem)
    {
        contentItem.Content.Body = InsecureEndjinOrigin.Replace(contentItem.Content.Body, "https://endjin.com");

        return contentItem;
    }
}
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,11 @@ | ||
// <copyright file="IPostConvertCleaner.cs" company="Endjin Limited"> | ||
// Copyright (c) Endjin Limited. All rights reserved. | ||
// </copyright> | ||
|
||
namespace Stacker.Cli.Cleaners | ||
{ | ||
/// <summary>
/// A cleaner that operates on text content and returns the cleaned text.
/// </summary>
public interface IPostConvertCleaner
{
    /// <summary>
    /// Cleans the supplied text.
    /// </summary>
    /// <param name="content">The text to clean.</param>
    /// <returns>The cleaned text.</returns>
    string Clean(string content);
}
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,13 @@ | ||
// <copyright file="IPostDownloadCleaner.cs" company="Endjin Limited"> | ||
// Copyright (c) Endjin Limited. All rights reserved. | ||
// </copyright> | ||
|
||
namespace Stacker.Cli.Cleaners | ||
{ | ||
using Stacker.Cli.Domain.Universal; | ||
|
||
/// <summary>
/// A cleaner that operates on a <see cref="ContentItem"/> and returns the cleaned item.
/// The name suggests it is meant to run after attachments have been downloaded.
/// </summary>
public interface IPostDownloadCleaner
{
    /// <summary>
    /// Cleans the supplied content item.
    /// </summary>
    /// <param name="contentItem">The content item to clean.</param>
    /// <returns>The cleaned content item.</returns>
    ContentItem Clean(ContentItem contentItem);
}
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,13 @@ | ||
// <copyright file="IPreDownloadCleaner.cs" company="Endjin Limited"> | ||
// Copyright (c) Endjin Limited. All rights reserved. | ||
// </copyright> | ||
|
||
namespace Stacker.Cli.Cleaners | ||
{ | ||
using Stacker.Cli.Domain.Universal; | ||
|
||
/// <summary>
/// A cleaner that operates on a <see cref="ContentItem"/> and returns the cleaned item.
/// The name suggests it is meant to run before attachments are downloaded.
/// </summary>
public interface IPreDownloadCleaner
{
    /// <summary>
    /// Cleans the supplied content item.
    /// </summary>
    /// <param name="contentItem">The content item to clean.</param>
    /// <returns>The cleaned content item.</returns>
    ContentItem Clean(ContentItem contentItem);
}
} |
19 changes: 19 additions & 0 deletions
19
Solutions/Stacker.Cli/Cleaners/RemoveHeaderImageFromBody.cs
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,19 @@ | ||
// <copyright file="RemoveHeaderImageFromBody.cs" company="Endjin Limited"> | ||
// Copyright (c) Endjin Limited. All rights reserved. | ||
// </copyright> | ||
|
||
namespace Stacker.Cli.Cleaners | ||
{ | ||
using System; | ||
using System.Text.RegularExpressions; | ||
|
||
/// <summary>
/// Removes the first Markdown image (<c>![alt](src)</c>, optionally preceded by <c>[</c>)
/// found in the content. Later images are left in place.
/// </summary>
public class RemoveHeaderImageFromBody : IPostConvertCleaner
{
    /// <summary>
    /// Matches Markdown image syntax, with an optional leading '[' for a linked image.
    /// </summary>
    /// <remarks>
    /// Built once and shared instead of being recompiled on every call. Both <c>.*</c> parts are
    /// deliberately greedy: for a linked image such as <c>[![alt](img)](link)</c> the match runs
    /// through to the final ')' on the line, so the whole construct is removed.
    /// </remarks>
    private static readonly Regex MarkdownImage = new Regex(
        @"(\[?!\[.*\]\(.*\))",
        RegexOptions.Compiled,
        TimeSpan.FromSeconds(1));

    /// <summary>
    /// Deletes the first match of the image pattern from <paramref name="content"/>.
    /// </summary>
    /// <param name="content">The text to clean.</param>
    /// <returns>The text with the first image removed, or the unchanged text when there is none.</returns>
    public string Clean(string content)
    {
        // count: 1 => only the first match is replaced; startat: 0 => search from the beginning.
        return MarkdownImage.Replace(content, string.Empty, 1, 0);
    }
}
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,24 @@ | ||
// <copyright file="RemoveHostNamesFromBody.cs" company="Endjin Limited"> | ||
// Copyright (c) Endjin Limited. All rights reserved. | ||
// </copyright> | ||
|
||
namespace Stacker.Cli.Cleaners | ||
{ | ||
using System; | ||
using System.Text.RegularExpressions; | ||
using Stacker.Cli.Domain.Universal; | ||
|
||
/// <summary>
/// Strips the scheme and host of the blog's known origins from the body of a content item,
/// leaving rooted relative URLs behind.
/// </summary>
public class RemoveHostNamesFromBody : IPostDownloadCleaner
{
    /// <summary>
    /// Matches <c>http(s)://blog.endjin.com</c>, <c>http(s)://blogs.endjin.com</c> and
    /// <c>http(s)://endjinblog.azurewebsites.net</c>.
    /// </summary>
    /// <remarks>
    /// Built once and shared instead of being recompiled on every call. The dots in the host
    /// names are escaped so that they only match a literal '.', not an arbitrary character.
    /// </remarks>
    private static readonly Regex BlogOrigin = new Regex(
        @"(https?:\/\/(?:(?:blogs?\.endjin\.com)|(?:endjinblog\.azurewebsites\.net)))",
        RegexOptions.Compiled,
        TimeSpan.FromSeconds(1));

    /// <summary>
    /// Removes every occurrence of a blog origin from the body.
    /// </summary>
    /// <param name="contentItem">The content item whose body is updated in place.</param>
    /// <returns>The same <paramref name="contentItem"/> instance.</returns>
    public ContentItem Clean(ContentItem contentItem)
    {
        contentItem.Content.Body = BlogOrigin.Replace(contentItem.Content.Body, string.Empty);

        return contentItem;
    }
}
} |
19 changes: 19 additions & 0 deletions
19
Solutions/Stacker.Cli/Cleaners/RemoveThreeBlankLinesFromStartBody.cs
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,19 @@ | ||
// <copyright file="RemoveThreeBlankLinesFromStartBody.cs" company="Endjin Limited"> | ||
// Copyright (c) Endjin Limited. All rights reserved. | ||
// </copyright> | ||
|
||
namespace Stacker.Cli.Cleaners | ||
{ | ||
using System; | ||
using System.Text.RegularExpressions; | ||
|
||
/// <summary>
/// Collapses each run of three consecutive CRLF line breaks into a single
/// <see cref="Environment.NewLine"/>.
/// </summary>
/// <remarks>
/// Despite the class name, the pattern is not anchored: the replacement happens wherever three
/// consecutive CRLFs occur, not only at the start of the content. LF-only line breaks are not matched.
/// </remarks>
public class RemoveThreeBlankLinesFromStartBody : IPostConvertCleaner
{
    /// <summary>
    /// Matches exactly three consecutive CRLF sequences.
    /// Built once and shared instead of being recompiled on every call.
    /// </summary>
    private static readonly Regex ThreeLineBreaks = new Regex(
        @"(\r\n){3}",
        RegexOptions.Compiled,
        TimeSpan.FromSeconds(1));

    /// <summary>
    /// Replaces every run of three CRLFs in <paramref name="content"/> with one new line.
    /// </summary>
    /// <param name="content">The text to clean.</param>
    /// <returns>The cleaned text.</returns>
    public string Clean(string content)
    {
        return ThreeLineBreaks.Replace(content, Environment.NewLine);
    }
}
} |
18 changes: 18 additions & 0 deletions
18
Solutions/Stacker.Cli/Cleaners/ReplaceNewLineWithParagraphTagCleaner.cs
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,18 @@ | ||
// <copyright file="ReplaceNewLineWithParagraphTagCleaner.cs" company="Endjin Limited"> | ||
// Copyright (c) Endjin Limited. All rights reserved. | ||
// </copyright> | ||
|
||
namespace Stacker.Cli.Cleaners | ||
{ | ||
using Stacker.Cli.Domain.Universal; | ||
|
||
/// <summary>
/// Replaces every line-feed character in the body of a content item with a <c>&lt;p/&gt;</c> tag.
/// </summary>
public class ReplaceNewLineWithParagraphTagCleaner : IPreDownloadCleaner
{
    /// <summary>
    /// Rewrites the body so that each "\n" becomes a paragraph tag. Carriage returns are not touched.
    /// </summary>
    /// <param name="contentItem">The content item whose body is updated in place.</param>
    /// <returns>The same <paramref name="contentItem"/> instance.</returns>
    public ContentItem Clean(ContentItem contentItem)
    {
        string originalBody = contentItem.Content.Body;
        string bodyWithParagraphTags = originalBody.Replace("\n", "<p/>");

        contentItem.Content.Body = bodyWithParagraphTags;

        return contentItem;
    }
}
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,14 @@ | ||
// <copyright file="ReplaceSmartQuotes.cs" company="Endjin Limited"> | ||
// Copyright (c) Endjin Limited. All rights reserved. | ||
// </copyright> | ||
|
||
namespace Stacker.Cli.Cleaners | ||
{ | ||
/// <summary>
/// Replaces typographic ("smart") double and single quotes with their plain ASCII equivalents.
/// </summary>
public class ReplaceSmartQuotes : IPostConvertCleaner
{
    /// <summary>
    /// Substitutes each curly quote character in <paramref name="content"/> with a straight quote.
    /// </summary>
    /// <param name="content">The text to clean.</param>
    /// <returns>The text with straight quotes only.</returns>
    public string Clean(string content)
    {
        // Each row is { smart quote, plain replacement }.
        string[,] substitutions =
        {
            { "“", "\"" },
            { "”", "\"" },
            { "’", "'" },
            { "‘", "'" },
        };

        string straightened = content;

        for (int row = 0; row < substitutions.GetLength(0); row++)
        {
            straightened = straightened.Replace(substitutions[row, 0], substitutions[row, 1]);
        }

        return straightened;
    }
}
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,19 @@ | ||
// <copyright file="ReplaceWpUploadPath.cs" company="Endjin Limited"> | ||
// Copyright (c) Endjin Limited. All rights reserved. | ||
// </copyright> | ||
|
||
namespace Stacker.Cli.Cleaners | ||
{ | ||
using System; | ||
using System.Text.RegularExpressions; | ||
|
||
/// <summary>
/// Re-points rooted WordPress upload paths that open a parenthesised target,
/// i.e. <c>(/wp-content/uploads/</c>, at <c>(/assets/images/blog/</c>.
/// </summary>
public class ReplaceWpUploadPath : IPostConvertCleaner
{
    /// <summary>
    /// Matches an opening parenthesis immediately followed by <c>/wp-content/uploads/</c>.
    /// Built once and shared instead of being recompiled on every call.
    /// </summary>
    private static readonly Regex UploadPathInParentheses = new Regex(
        @"\((\/wp-content\/uploads\/)",
        RegexOptions.Compiled,
        TimeSpan.FromSeconds(1));

    /// <summary>
    /// Replaces every <c>(/wp-content/uploads/</c> in <paramref name="content"/> with
    /// <c>(/assets/images/blog/</c>. Upload paths not directly preceded by '(' are left alone.
    /// </summary>
    /// <param name="content">The text to clean.</param>
    /// <returns>The cleaned text.</returns>
    public string Clean(string content)
    {
        return UploadPathInParentheses.Replace(content, "(/assets/images/blog/");
    }
}
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,25 @@ | ||
// <copyright file="UpdateInternalPostUrls.cs" company="Endjin Limited"> | ||
// Copyright (c) Endjin Limited. All rights reserved. | ||
// </copyright> | ||
|
||
namespace Stacker.Cli.Cleaners | ||
{ | ||
using System; | ||
using System.Text.RegularExpressions; | ||
|
||
/// <summary>
/// Rewrites rooted, date-based post links of the form <c>(/yyyy/MM/slug/</c> to
/// <c>(/blog/yyyy/MM/slug.html</c>.
/// </summary>
public class UpdateInternalPostUrls : IPostConvertCleaner
{
    /// <summary>
    /// Matches '(' followed by <c>/yyyy/MM/slug</c> (captured) and a trailing '/'.
    /// </summary>
    /// <remarks>
    /// The slug is restricted to characters other than '/', ')' and whitespace. The previous lazy
    /// <c>.*?</c> could run past the closing ')' of a link that has no trailing slash and swallow
    /// text up to the next '/' on the line, corrupting everything in between. Built once and
    /// shared instead of being recompiled on every call.
    /// </remarks>
    private static readonly Regex InternalPostLink = new Regex(
        @"\((\/\d{4}\/\d{2}\/[^\/)\s]+)\/",
        RegexOptions.Compiled,
        TimeSpan.FromSeconds(1));

    /// <summary>
    /// Rewrites every matching internal post link in <paramref name="content"/>.
    /// </summary>
    /// <param name="content">The text to clean.</param>
    /// <returns>The text with internal post links rewritten; other text is unchanged.</returns>
    public string Clean(string content)
    {
        // Group 1 is "/yyyy/MM/slug"; the trailing slash is consumed by the match and replaced by ".html".
        return InternalPostLink.Replace(content, match => $"(/blog{match.Groups[1].Value}.html");
    }
}
} |
27 changes: 27 additions & 0 deletions
27
Solutions/Stacker.Cli/Cleaners/WordPressImageResizerCleaner.cs
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,27 @@ | ||
// <copyright file="WordPressImageResizerCleaner.cs" company="Endjin Limited"> | ||
// Copyright (c) Endjin Limited. All rights reserved. | ||
// </copyright> | ||
|
||
namespace Stacker.Cli.Cleaners | ||
{ | ||
using System.Text.RegularExpressions; | ||
using Stacker.Cli.Domain.Universal; | ||
|
||
/// <summary>
/// Strips size and thumbnail suffixes (e.g. <c>-300x200</c>, <c>_thumb</c>, <c>_thumb1</c>) that
/// appear immediately before a <c>.png</c> or <c>.jpg</c> extension, both in the body and in
/// every attachment's path and URL.
/// </summary>
public class WordPressImageResizerCleaner : IPreDownloadCleaner
{
    /// <summary>
    /// Matches a <c>-WxH</c> or <c>_thumb[N]</c> suffix that is directly followed by
    /// <c>.png</c> or <c>.jpg</c>.
    /// </summary>
    /// <remarks>
    /// The dots in the look-ahead are escaped so that only a real extension separator matches
    /// (the unescaped form also matched e.g. <c>-10x10Xpng</c>). One shared instance replaces the
    /// pattern string that was previously passed to three separate static calls. As before, no
    /// match timeout is applied.
    /// </remarks>
    private static readonly Regex ResizeSuffix = new Regex(
        @"(-\d+?x\d+?|(_thumb(\d+?)?))(?=\.png|\.jpg)",
        RegexOptions.Compiled);

    /// <summary>
    /// Removes the resize suffixes from the body and from each attachment's path and URL.
    /// </summary>
    /// <param name="contentItem">The content item that is updated in place.</param>
    /// <returns>The same <paramref name="contentItem"/> instance.</returns>
    public ContentItem Clean(ContentItem contentItem)
    {
        contentItem.Content.Body = ResizeSuffix.Replace(contentItem.Content.Body, string.Empty);

        foreach (var attachment in contentItem.Content.Attachments)
        {
            attachment.Path = ResizeSuffix.Replace(attachment.Path, string.Empty);
            attachment.Url = ResizeSuffix.Replace(attachment.Url, string.Empty);
        }

        return contentItem;
    }
}
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Oops, something went wrong.