Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
40 changes: 3 additions & 37 deletions .github/workflows/convert-and-publish.yml
Original file line number Diff line number Diff line change
Expand Up @@ -23,50 +23,16 @@ jobs:
Set-PSRepository -Name PSGallery -InstallationPolicy Trusted
Install-Module -Name OpenXML -Force -Scope CurrentUser

- name: Import module and convert all specs
- name: Build publish tree and Windows_Protocols.zip
shell: pwsh
working-directory: ${{ github.workspace }}
run: |
Import-Module .\AwakeCoding.OpenSpecs -Force
Get-OpenSpecCatalog |
Save-OpenSpecDocument -Format DOCX -OutputPath ./downloads-convert -Force |
Where-Object { $_.Status -in 'Downloaded', 'Exists' } |
Convert-OpenSpecToMarkdown -OutputPath ./converted-specs -Force -Parallel -ThrottleLimit 4

- name: Build publish directory and index
shell: pwsh
working-directory: ${{ github.workspace }}
run: |
Import-Module .\AwakeCoding.OpenSpecs -Force
$converted = Join-Path $PWD 'converted-specs'
$publish = Join-Path $PWD 'publish'
New-Item -Path $publish -ItemType Directory -Force | Out-Null
Get-ChildItem -LiteralPath $converted -Directory | ForEach-Object {
$name = $_.Name
$md = Join-Path $_.FullName "$name.md"
if (-not (Test-Path -LiteralPath $md)) { $md = Join-Path $_.FullName 'index.md' }
if (-not (Test-Path -LiteralPath $md)) { return }
$dest = Join-Path $publish $name
New-Item -Path $dest -ItemType Directory -Force | Out-Null
Copy-Item -LiteralPath $md -Destination (Join-Path $dest 'index.md') -Force
$media = Join-Path $_.FullName 'media'
if (Test-Path -LiteralPath $media -PathType Container) {
Copy-Item -LiteralPath $media -Destination $dest -Recurse -Force
}
}
Update-OpenSpecIndex -Path $publish

- name: Zip publish contents
shell: pwsh
working-directory: ${{ github.workspace }}
run: |
Compress-Archive -Path .\publish\* -DestinationPath .\publish.zip -Force
run: .\scripts\Build-Publish.ps1

- name: Upload publish artifact
uses: actions/upload-artifact@v4
with:
name: publish
path: publish.zip
path: Windows_Protocols.zip

- name: Push to orphaned publish branch
shell: pwsh
Expand Down
12 changes: 9 additions & 3 deletions AGENTS.md
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,13 @@ Instructions for AI agents working in this repository. For user-facing usage and

## Project summary

This repo is a PowerShell module that discovers and downloads Microsoft Open Specifications (Windows Protocols) from Learn and converts DOCX/PDF documents to strict GFM Markdown. There is no separate build: the module is `AwakeCoding.OpenSpecs.psd1` + `AwakeCoding.OpenSpecs.psm1` plus dot-sourced `Public/*.ps1` and `Private/*.ps1` on load. Target runtimes are PowerShell 5.1 and 7 (PSEditions Desktop and Core).
This repo is a PowerShell module that discovers and downloads Microsoft Open Specifications (Windows Protocols) from Learn and converts DOCX/PDF documents to strict GFM Markdown. There is no separate build: the module is `AwakeCoding.OpenSpecs.psd1` + `AwakeCoding.OpenSpecs.psm1` plus dot-sourced `Public/*.ps1` and `Private/*.ps1` on load.

## PowerShell version (required)

- **PowerShell 7 only.** Use the latest stable PowerShell 7 (pwsh) at all times. This is mandatory.
- **Windows PowerShell (5.1) compatibility is not a goal and is forbidden.** Do not add workarounds, conditional logic, or compatibility shims for Windows PowerShell. Code must assume PowerShell 7+ exclusively.
- Run all scripts, tests, and module commands with `pwsh`, not `powershell.exe`. CI, local development, and any tooling must target PowerShell 7.

## File and directory structure

Expand Down Expand Up @@ -43,7 +49,7 @@ Tests use Pester 5. From repo root:
Invoke-Pester ./tests
```

Use PowerShell 7 when possible for consistency with CI. Some tests are tagged `Live` and hit the network (Find-OpenSpec, Get-OpenSpecDownloadLink). To skip them:
Use PowerShell 7 (required; see above). Some tests are tagged `Live` and hit the network (Find-OpenSpec, Get-OpenSpecDownloadLink). To skip them:

```powershell
Invoke-Pester ./tests -Tag '!Live'
Expand All @@ -55,4 +61,4 @@ When you add a new exported function, add its name to the `$expected` array in t

- Do not remove or rename exported functions without updating `AwakeCoding.OpenSpecs.psd1` and the exports test.
- Conversion: DOCX is handled in-module via OpenXML; PDF uses external `docling` or `markitdown` when available (see `AwakeCoding.OpenSpecs/Private/Get-OpenSpecToolchain.ps1`). Output is textual (tables, ASCII), not image-based.
- For bulk or CI conversions, use `-Parallel -ThrottleLimit N` on PowerShell 7 with `Convert-OpenSpecToMarkdown` or `Invoke-OpenSpecConversionPipeline`.
- For bulk or CI conversions, use `-Parallel -ThrottleLimit N` with `Convert-OpenSpecToMarkdown` or `Invoke-OpenSpecConversionPipeline` (PowerShell 7 only).
173 changes: 170 additions & 3 deletions AwakeCoding.OpenSpecs/Private/ConvertFrom-OpenSpecDocx.ps1
Original file line number Diff line number Diff line change
Expand Up @@ -92,6 +92,24 @@ function ConvertFrom-OpenSpecDocxWithOpenXml {
$relationshipMap = Get-OpenSpecOpenXmlRelationshipMap -Archive $archive
$lines = New-Object System.Collections.Generic.List[string]
$emittedAnchors = [System.Collections.Generic.HashSet[string]]::new([System.StringComparer]::OrdinalIgnoreCase)
$linkMetadata = [ordered]@{
GuidToSection = @{}
SectionToTitle = @{}
TocAlias = @{}
GuidToGlossarySlug = @{}
InternalHyperlinks = New-Object System.Collections.Generic.List[object]
Stats = [ordered]@{
ParagraphCount = 0
HeadingCount = 0
BookmarkCount = 0
InternalHyperlinkCount = 0
GuidSectionMapCount = 0
TocAliasCount = 0
GlossaryGuidMapCount = 0
}
}
$inGlossary = $false
$glossaryHeadingLevel = 0

# Resolve media output directory for image extraction.
$resolvedMediaDir = $null
Expand All @@ -101,10 +119,12 @@ function ConvertFrom-OpenSpecDocxWithOpenXml {

foreach ($child in $body.ChildNodes) {
if ($child.LocalName -eq 'p') {
$linkMetadata.Stats.ParagraphCount++
$text = ConvertFrom-OpenSpecOpenXmlParagraph -ParagraphNode $child -NamespaceManager $nsmgr -RelationshipMap $relationshipMap -Archive $archive -MediaOutputDirectory $resolvedMediaDir
$styleNode = $child.SelectSingleNode('./w:pPr/w:pStyle', $nsmgr)
$style = if ($styleNode -and $styleNode.Attributes) { $styleNode.GetAttribute('val', 'http://schemas.openxmlformats.org/wordprocessingml/2006/main') } else { '' }
$anchors = Get-OpenSpecOpenXmlParagraphAnchors -ParagraphNode $child -NamespaceManager $nsmgr -ParagraphText $text -HeadingStyle $style
$anchorInfo = Get-OpenSpecOpenXmlParagraphAnchorInfo -ParagraphNode $child -NamespaceManager $nsmgr -ParagraphText $text -HeadingStyle $style
$anchors = @($anchorInfo.Anchors)

foreach ($anchor in $anchors) {
if ([string]::IsNullOrWhiteSpace($anchor)) {
Expand All @@ -117,6 +137,13 @@ function ConvertFrom-OpenSpecDocxWithOpenXml {

$lines.Add(('<a id="' + $anchor + '"></a>'))
}
$linkMetadata.Stats.BookmarkCount += @($anchorInfo.BookmarkNames).Count

$internalLinks = Get-OpenSpecOpenXmlParagraphInternalHyperlinks -ParagraphNode $child -NamespaceManager $nsmgr
foreach ($internalLink in $internalLinks) {
[void]$linkMetadata.InternalHyperlinks.Add($internalLink)
}
$linkMetadata.Stats.InternalHyperlinkCount += @($internalLinks).Count

$numberingNode = $child.SelectSingleNode('./w:pPr/w:numPr', $nsmgr)
if ([string]::IsNullOrWhiteSpace($text)) {
Expand All @@ -128,11 +155,42 @@ function ConvertFrom-OpenSpecDocxWithOpenXml {

if ($style -match '^Heading(?<level>[1-6])$') {
$level = [int]$Matches['level']
$linkMetadata.Stats.HeadingCount++
# Strip bold from heading text — the heading style (#) already implies bold.
# Keep italic and code formatting if present.
$headingText = ($text -replace '\*\*(?!\*)', '').Trim()
$lines.Add((('{0} ' -f ('#' * $level)) + $headingText))
$lines.Add('')

$isGlossaryHeading = $headingText -match '(?i)^\d+(?:\.\d+)*\s+Glossary$'
if ($isGlossaryHeading) {
$inGlossary = $true
$glossaryHeadingLevel = $level
}
elseif ($inGlossary -and $level -le $glossaryHeadingLevel) {
$inGlossary = $false
}

$sectionAnchor = $anchorInfo.SectionAnchor
if (-not [string]::IsNullOrWhiteSpace($sectionAnchor)) {
if (-not $linkMetadata.SectionToTitle.ContainsKey($sectionAnchor)) {
$linkMetadata.SectionToTitle[$sectionAnchor] = $headingText
}

foreach ($bookmarkName in @($anchorInfo.BookmarkNames)) {
if ($bookmarkName -match '(?i)^section_(?<guid>[a-f0-9]{32})$') {
$guid = $Matches['guid'].ToLowerInvariant()
if (-not $linkMetadata.GuidToSection.ContainsKey($guid)) {
$linkMetadata.GuidToSection[$guid] = $sectionAnchor
}
}
elseif ($bookmarkName -match '^_Toc\d+$') {
if (-not $linkMetadata.TocAlias.ContainsKey($bookmarkName)) {
$linkMetadata.TocAlias[$bookmarkName] = $sectionAnchor
}
}
}
}
}
elseif ($numberingNode) {
$lines.Add(('- ' + $text.Trim()))
Expand All @@ -141,6 +199,22 @@ function ConvertFrom-OpenSpecDocxWithOpenXml {
$lines.Add($text.Trim())
$lines.Add('')
}

if ($inGlossary) {
$defMatch = [regex]::Match($text, '^\s*\*\*(?<term>[^*]+)\*\*\s*:\s*')
if ($defMatch.Success) {
$term = $defMatch.Groups['term'].Value.Trim()
$slug = Get-OpenSpecGlossarySlugFromTerm -Term $term
foreach ($bookmarkName in @($anchorInfo.BookmarkNames)) {
if ($bookmarkName -match '(?i)^gt_(?<guid>[a-f0-9\-]{36})$') {
$guid = $Matches['guid'].ToLowerInvariant()
if (-not $linkMetadata.GuidToGlossarySlug.ContainsKey($guid)) {
$linkMetadata.GuidToGlossarySlug[$guid] = $slug
}
}
}
}
}
}
elseif ($child.LocalName -eq 'tbl') {
$tableLines = ConvertFrom-OpenSpecOpenXmlTable -TableNode $child -NamespaceManager $nsmgr -RelationshipMap $relationshipMap -Archive $archive -MediaOutputDirectory $resolvedMediaDir
Expand All @@ -157,6 +231,11 @@ function ConvertFrom-OpenSpecDocxWithOpenXml {
throw 'OpenXml conversion produced empty markdown output.'
}

$linkMetadata.Stats.GuidSectionMapCount = $linkMetadata.GuidToSection.Count
$linkMetadata.Stats.TocAliasCount = $linkMetadata.TocAlias.Count
$linkMetadata.Stats.GlossaryGuidMapCount = $linkMetadata.GuidToGlossarySlug.Count
$notes.Add("Link metadata captured: guidToSection=$($linkMetadata.Stats.GuidSectionMapCount), tocAlias=$($linkMetadata.Stats.TocAliasCount), guidToGlossarySlug=$($linkMetadata.Stats.GlossaryGuidMapCount), internalLinks=$($linkMetadata.Stats.InternalHyperlinkCount).")

$markdown | Set-Content -LiteralPath $OutputPath -Encoding UTF8
}
finally {
Expand All @@ -165,11 +244,21 @@ function ConvertFrom-OpenSpecDocxWithOpenXml {
}
}

$linkMetadataOut = [ordered]@{
GuidToSection = $linkMetadata.GuidToSection
SectionToTitle = $linkMetadata.SectionToTitle
TocAlias = $linkMetadata.TocAlias
GuidToGlossarySlug = $linkMetadata.GuidToGlossarySlug
InternalHyperlinks = @($linkMetadata.InternalHyperlinks.ToArray())
Stats = $linkMetadata.Stats
}

return [pscustomobject]@{
PSTypeName = 'AwakeCoding.OpenSpecs.ConversionStep'
Strategy = 'openxml-docx'
OutputPath = $OutputPath
Notes = $notes.ToArray()
LinkMetadata = $linkMetadataOut
}
}

Expand Down Expand Up @@ -679,7 +768,7 @@ function ConvertFrom-OpenSpecOpenXmlRunText {
return ($parts.ToArray() -join '')
}

function Get-OpenSpecOpenXmlParagraphAnchors {
function Get-OpenSpecOpenXmlParagraphAnchorInfo {
[CmdletBinding()]
param(
[Parameter(Mandatory)]
Expand All @@ -696,6 +785,8 @@ function Get-OpenSpecOpenXmlParagraphAnchors {
)

$anchors = New-Object System.Collections.Generic.List[string]
$bookmarkNames = New-Object System.Collections.Generic.List[string]
$sectionAnchor = $null

$bookmarkNodes = $ParagraphNode.SelectNodes('.//w:bookmarkStart', $NamespaceManager)
foreach ($bookmarkNode in $bookmarkNodes) {
Expand All @@ -709,6 +800,7 @@ function Get-OpenSpecOpenXmlParagraphAnchors {
}

$anchors.Add($bookmarkName)
$bookmarkNames.Add($bookmarkName)
}

if ($HeadingStyle -match '^Heading[1-6]$') {
Expand All @@ -718,7 +810,82 @@ function Get-OpenSpecOpenXmlParagraphAnchors {
}
}

return @($anchors.ToArray() | Select-Object -Unique)
[pscustomobject]@{
Anchors = @($anchors.ToArray() | Select-Object -Unique)
BookmarkNames = @($bookmarkNames.ToArray() | Select-Object -Unique)
SectionAnchor = $sectionAnchor
}
}

function Get-OpenSpecOpenXmlParagraphAnchors {
    <#
    .SYNOPSIS
    Returns only the anchor names for a paragraph.

    .DESCRIPTION
    Thin wrapper that forwards all four arguments to
    Get-OpenSpecOpenXmlParagraphAnchorInfo and emits the Anchors property of
    the object it returns, wrapped in an array.

    .PARAMETER ParagraphNode
    The paragraph XML node to inspect.

    .PARAMETER NamespaceManager
    Namespace manager forwarded unchanged to the anchor-info helper.

    .PARAMETER ParagraphText
    Paragraph text forwarded unchanged to the anchor-info helper.

    .PARAMETER HeadingStyle
    Paragraph style name forwarded unchanged to the anchor-info helper.

    .OUTPUTS
    The values of the Anchors property of the anchor-info result.
    #>
    [CmdletBinding()]
    param(
        [Parameter(Mandatory)]
        [System.Xml.XmlNode]$ParagraphNode,

        [Parameter(Mandatory)]
        [System.Xml.XmlNamespaceManager]$NamespaceManager,

        [Parameter()]
        [string]$ParagraphText,

        [Parameter()]
        [string]$HeadingStyle
    )

    # Every argument is forwarded explicitly (including empty strings) so the
    # helper receives exactly what this wrapper was given.
    $anchorInfoArguments = @{
        ParagraphNode    = $ParagraphNode
        NamespaceManager = $NamespaceManager
        ParagraphText    = $ParagraphText
        HeadingStyle     = $HeadingStyle
    }
    $anchorInfo = Get-OpenSpecOpenXmlParagraphAnchorInfo @anchorInfoArguments

    @($anchorInfo.Anchors)
}

function Get-OpenSpecOpenXmlParagraphInternalHyperlinks {
    <#
    .SYNOPSIS
    Collects the anchor-based hyperlinks found inside a paragraph node.

    .DESCRIPTION
    Selects every descendant w:hyperlink element that carries a w:anchor
    attribute, skips those whose anchor is blank, and emits one object per
    remaining link with the anchor name and the link's visible text.

    The visible text is the concatenation of all descendant w:t nodes with
    whitespace runs collapsed to a single space and the ends trimmed.
    Whitespace-only w:t nodes are kept during concatenation: they are the
    separators between words when the link text is split across several runs,
    and dropping them would glue neighbouring words together.

    .PARAMETER ParagraphNode
    The paragraph XML node to search.

    .PARAMETER NamespaceManager
    Namespace manager used for the XPath queries; it must map the 'w' prefix.

    .OUTPUTS
    PSCustomObject with the properties Anchor and Text, one per hyperlink.
    #>
    [CmdletBinding()]
    param(
        [Parameter(Mandatory)]
        [System.Xml.XmlNode]$ParagraphNode,

        [Parameter(Mandatory)]
        [System.Xml.XmlNamespaceManager]$NamespaceManager
    )

    $wordNamespace = 'http://schemas.openxmlformats.org/wordprocessingml/2006/main'
    $links = New-Object System.Collections.Generic.List[object]

    $hyperlinkNodes = $ParagraphNode.SelectNodes('.//w:hyperlink[@w:anchor]', $NamespaceManager)
    foreach ($hyperlinkNode in $hyperlinkNodes) {
        $anchor = $hyperlinkNode.GetAttribute('anchor', $wordNamespace)
        if ([string]::IsNullOrWhiteSpace($anchor)) {
            continue
        }

        # Keep every text node, including whitespace-only ones; the
        # normalisation below collapses and trims any surplus whitespace.
        $parts = New-Object System.Collections.Generic.List[string]
        foreach ($textNode in $hyperlinkNode.SelectNodes('.//w:t', $NamespaceManager)) {
            $parts.Add($textNode.InnerText)
        }
        $text = (($parts.ToArray() -join '') -replace '\s+', ' ').Trim()

        $links.Add([pscustomobject]@{
            Anchor = $anchor
            Text = $text
        })
    }

    return @($links.ToArray())
}

function Get-OpenSpecGlossarySlugFromTerm {
    <#
    .SYNOPSIS
    Builds a 'gt_'-prefixed slug from a glossary term.

    .DESCRIPTION
    Applies these rewrites in order: whitespace runs become '-', every
    character that is neither a word character nor '-' is removed, repeated
    '-' collapse to one, and a leading or trailing '-' is stripped. The result
    is lower-cased with the invariant culture. If nothing is left, the slug
    body falls back to 'term'.

    .PARAMETER Term
    The glossary term to convert.

    .OUTPUTS
    System.String. The slug, always starting with 'gt_'.
    #>
    [CmdletBinding()]
    param(
        [Parameter(Mandatory)]
        [string]$Term
    )

    # Ordered (pattern, replacement) pairs; the order matters because later
    # steps clean up hyphens produced by earlier ones.
    $rewriteSteps = @(
        @('\s+', '-'),
        @('[^\w\-]', ''),
        @('-+', '-'),
        @('^-|-$', '')
    )

    $normalized = $Term
    foreach ($step in $rewriteSteps) {
        $normalized = $normalized -replace $step[0], $step[1]
    }
    $normalized = $normalized.ToLowerInvariant()

    if ([string]::IsNullOrWhiteSpace($normalized)) {
        return 'gt_term'
    }

    'gt_' + $normalized
}

function Get-OpenSpecSectionAnchorFromHeadingText {
Expand Down
Loading