diff --git a/.github/workflows/convert-and-publish.yml b/.github/workflows/convert-and-publish.yml index bb40d92f..75fa8553 100644 --- a/.github/workflows/convert-and-publish.yml +++ b/.github/workflows/convert-and-publish.yml @@ -23,50 +23,16 @@ jobs: Set-PSRepository -Name PSGallery -InstallationPolicy Trusted Install-Module -Name OpenXML -Force -Scope CurrentUser - - name: Import module and convert all specs + - name: Build publish tree and Windows_Protocols.zip shell: pwsh working-directory: ${{ github.workspace }} - run: | - Import-Module .\AwakeCoding.OpenSpecs -Force - Get-OpenSpecCatalog | - Save-OpenSpecDocument -Format DOCX -OutputPath ./downloads-convert -Force | - Where-Object { $_.Status -in 'Downloaded', 'Exists' } | - Convert-OpenSpecToMarkdown -OutputPath ./converted-specs -Force -Parallel -ThrottleLimit 4 - - - name: Build publish directory and index - shell: pwsh - working-directory: ${{ github.workspace }} - run: | - Import-Module .\AwakeCoding.OpenSpecs -Force - $converted = Join-Path $PWD 'converted-specs' - $publish = Join-Path $PWD 'publish' - New-Item -Path $publish -ItemType Directory -Force | Out-Null - Get-ChildItem -LiteralPath $converted -Directory | ForEach-Object { - $name = $_.Name - $md = Join-Path $_.FullName "$name.md" - if (-not (Test-Path -LiteralPath $md)) { $md = Join-Path $_.FullName 'index.md' } - if (-not (Test-Path -LiteralPath $md)) { return } - $dest = Join-Path $publish $name - New-Item -Path $dest -ItemType Directory -Force | Out-Null - Copy-Item -LiteralPath $md -Destination (Join-Path $dest 'index.md') -Force - $media = Join-Path $_.FullName 'media' - if (Test-Path -LiteralPath $media -PathType Container) { - Copy-Item -LiteralPath $media -Destination $dest -Recurse -Force - } - } - Update-OpenSpecIndex -Path $publish - - - name: Zip publish contents - shell: pwsh - working-directory: ${{ github.workspace }} - run: | - Compress-Archive -Path .\publish\* -DestinationPath .\publish.zip -Force + run: 
.\scripts\Build-Publish.ps1 - name: Upload publish artifact uses: actions/upload-artifact@v4 with: name: publish - path: publish.zip + path: Windows_Protocols.zip - name: Push to orphaned publish branch shell: pwsh diff --git a/AGENTS.md b/AGENTS.md index 1f844450..bd2b6fde 100644 --- a/AGENTS.md +++ b/AGENTS.md @@ -4,7 +4,13 @@ Instructions for AI agents working in this repository. For user-facing usage and ## Project summary -This repo is a PowerShell module that discovers and downloads Microsoft Open Specifications (Windows Protocols) from Learn and converts DOCX/PDF documents to strict GFM Markdown. There is no separate build: the module is `AwakeCoding.OpenSpecs.psd1` + `AwakeCoding.OpenSpecs.psm1` plus dot-sourced `Public/*.ps1` and `Private/*.ps1` on load. Target runtimes are PowerShell 5.1 and 7 (PSEditions Desktop and Core). +This repo is a PowerShell module that discovers and downloads Microsoft Open Specifications (Windows Protocols) from Learn and converts DOCX/PDF documents to strict GFM Markdown. There is no separate build: the module is `AwakeCoding.OpenSpecs.psd1` + `AwakeCoding.OpenSpecs.psm1` plus dot-sourced `Public/*.ps1` and `Private/*.ps1` on load. + +## PowerShell version (required) + +- **PowerShell 7 only.** Use the latest stable PowerShell 7 (pwsh) at all times. This is mandatory. +- **Windows PowerShell (5.1) compatibility is not a goal and is forbidden.** Do not add workarounds, conditional logic, or compatibility shims for Windows PowerShell. Code must assume PowerShell 7+ exclusively. +- Run all scripts, tests, and module commands with `pwsh`, not `powershell.exe`. CI, local development, and any tooling must target PowerShell 7. ## File and directory structure @@ -43,7 +49,7 @@ Tests use Pester 5. From repo root: Invoke-Pester ./tests ``` -Use PowerShell 7 when possible for consistency with CI. Some tests are tagged `Live` and hit the network (Find-OpenSpec, Get-OpenSpecDownloadLink). 
To skip them: +Use PowerShell 7 (required; see above). Some tests are tagged `Live` and hit the network (Find-OpenSpec, Get-OpenSpecDownloadLink). To skip them: ```powershell Invoke-Pester ./tests -Tag '!Live' @@ -55,4 +61,4 @@ When you add a new exported function, add its name to the `$expected` array in t - Do not remove or rename exported functions without updating `AwakeCoding.OpenSpecs.psd1` and the exports test. - Conversion: DOCX is handled in-module via OpenXML; PDF uses external `docling` or `markitdown` when available (see `AwakeCoding.OpenSpecs/Private/Get-OpenSpecToolchain.ps1`). Output is textual (tables, ASCII), not image-based. -- For bulk or CI conversions, use `-Parallel -ThrottleLimit N` on PowerShell 7 with `Convert-OpenSpecToMarkdown` or `Invoke-OpenSpecConversionPipeline`. +- For bulk or CI conversions, use `-Parallel -ThrottleLimit N` with `Convert-OpenSpecToMarkdown` or `Invoke-OpenSpecConversionPipeline` (PowerShell 7 only). diff --git a/AwakeCoding.OpenSpecs/Private/ConvertFrom-OpenSpecDocx.ps1 b/AwakeCoding.OpenSpecs/Private/ConvertFrom-OpenSpecDocx.ps1 index 2fce8e68..9a2859f9 100644 --- a/AwakeCoding.OpenSpecs/Private/ConvertFrom-OpenSpecDocx.ps1 +++ b/AwakeCoding.OpenSpecs/Private/ConvertFrom-OpenSpecDocx.ps1 @@ -92,6 +92,24 @@ function ConvertFrom-OpenSpecDocxWithOpenXml { $relationshipMap = Get-OpenSpecOpenXmlRelationshipMap -Archive $archive $lines = New-Object System.Collections.Generic.List[string] $emittedAnchors = [System.Collections.Generic.HashSet[string]]::new([System.StringComparer]::OrdinalIgnoreCase) + $linkMetadata = [ordered]@{ + GuidToSection = @{} + SectionToTitle = @{} + TocAlias = @{} + GuidToGlossarySlug = @{} + InternalHyperlinks = New-Object System.Collections.Generic.List[object] + Stats = [ordered]@{ + ParagraphCount = 0 + HeadingCount = 0 + BookmarkCount = 0 + InternalHyperlinkCount = 0 + GuidSectionMapCount = 0 + TocAliasCount = 0 + GlossaryGuidMapCount = 0 + } + } + $inGlossary = $false + $glossaryHeadingLevel 
= 0 # Resolve media output directory for image extraction. $resolvedMediaDir = $null @@ -101,10 +119,12 @@ function ConvertFrom-OpenSpecDocxWithOpenXml { foreach ($child in $body.ChildNodes) { if ($child.LocalName -eq 'p') { + $linkMetadata.Stats.ParagraphCount++ $text = ConvertFrom-OpenSpecOpenXmlParagraph -ParagraphNode $child -NamespaceManager $nsmgr -RelationshipMap $relationshipMap -Archive $archive -MediaOutputDirectory $resolvedMediaDir $styleNode = $child.SelectSingleNode('./w:pPr/w:pStyle', $nsmgr) $style = if ($styleNode -and $styleNode.Attributes) { $styleNode.GetAttribute('val', 'http://schemas.openxmlformats.org/wordprocessingml/2006/main') } else { '' } - $anchors = Get-OpenSpecOpenXmlParagraphAnchors -ParagraphNode $child -NamespaceManager $nsmgr -ParagraphText $text -HeadingStyle $style + $anchorInfo = Get-OpenSpecOpenXmlParagraphAnchorInfo -ParagraphNode $child -NamespaceManager $nsmgr -ParagraphText $text -HeadingStyle $style + $anchors = @($anchorInfo.Anchors) foreach ($anchor in $anchors) { if ([string]::IsNullOrWhiteSpace($anchor)) { @@ -117,6 +137,13 @@ function ConvertFrom-OpenSpecDocxWithOpenXml { $lines.Add(('')) } + $linkMetadata.Stats.BookmarkCount += @($anchorInfo.BookmarkNames).Count + + $internalLinks = Get-OpenSpecOpenXmlParagraphInternalHyperlinks -ParagraphNode $child -NamespaceManager $nsmgr + foreach ($internalLink in $internalLinks) { + [void]$linkMetadata.InternalHyperlinks.Add($internalLink) + } + $linkMetadata.Stats.InternalHyperlinkCount += @($internalLinks).Count $numberingNode = $child.SelectSingleNode('./w:pPr/w:numPr', $nsmgr) if ([string]::IsNullOrWhiteSpace($text)) { @@ -128,11 +155,42 @@ function ConvertFrom-OpenSpecDocxWithOpenXml { if ($style -match '^Heading(?[1-6])$') { $level = [int]$Matches['level'] + $linkMetadata.Stats.HeadingCount++ # Strip bold from heading text — the heading style (#) already implies bold. # Keep italic and code formatting if present. 
$headingText = ($text -replace '\*\*(?!\*)', '').Trim() $lines.Add((('{0} ' -f ('#' * $level)) + $headingText)) $lines.Add('') + + $isGlossaryHeading = $headingText -match '(?i)^\d+(?:\.\d+)*\s+Glossary$' + if ($isGlossaryHeading) { + $inGlossary = $true + $glossaryHeadingLevel = $level + } + elseif ($inGlossary -and $level -le $glossaryHeadingLevel) { + $inGlossary = $false + } + + $sectionAnchor = $anchorInfo.SectionAnchor + if (-not [string]::IsNullOrWhiteSpace($sectionAnchor)) { + if (-not $linkMetadata.SectionToTitle.ContainsKey($sectionAnchor)) { + $linkMetadata.SectionToTitle[$sectionAnchor] = $headingText + } + + foreach ($bookmarkName in @($anchorInfo.BookmarkNames)) { + if ($bookmarkName -match '(?i)^section_(?[a-f0-9]{32})$') { + $guid = $Matches['guid'].ToLowerInvariant() + if (-not $linkMetadata.GuidToSection.ContainsKey($guid)) { + $linkMetadata.GuidToSection[$guid] = $sectionAnchor + } + } + elseif ($bookmarkName -match '^_Toc\d+$') { + if (-not $linkMetadata.TocAlias.ContainsKey($bookmarkName)) { + $linkMetadata.TocAlias[$bookmarkName] = $sectionAnchor + } + } + } + } } elseif ($numberingNode) { $lines.Add(('- ' + $text.Trim())) @@ -141,6 +199,22 @@ function ConvertFrom-OpenSpecDocxWithOpenXml { $lines.Add($text.Trim()) $lines.Add('') } + + if ($inGlossary) { + $defMatch = [regex]::Match($text, '^\s*\*\*(?[^*]+)\*\*\s*:\s*') + if ($defMatch.Success) { + $term = $defMatch.Groups['term'].Value.Trim() + $slug = Get-OpenSpecGlossarySlugFromTerm -Term $term + foreach ($bookmarkName in @($anchorInfo.BookmarkNames)) { + if ($bookmarkName -match '(?i)^gt_(?[a-f0-9\-]{36})$') { + $guid = $Matches['guid'].ToLowerInvariant() + if (-not $linkMetadata.GuidToGlossarySlug.ContainsKey($guid)) { + $linkMetadata.GuidToGlossarySlug[$guid] = $slug + } + } + } + } + } } elseif ($child.LocalName -eq 'tbl') { $tableLines = ConvertFrom-OpenSpecOpenXmlTable -TableNode $child -NamespaceManager $nsmgr -RelationshipMap $relationshipMap -Archive $archive -MediaOutputDirectory 
$resolvedMediaDir @@ -157,6 +231,11 @@ function ConvertFrom-OpenSpecDocxWithOpenXml { throw 'OpenXml conversion produced empty markdown output.' } + $linkMetadata.Stats.GuidSectionMapCount = $linkMetadata.GuidToSection.Count + $linkMetadata.Stats.TocAliasCount = $linkMetadata.TocAlias.Count + $linkMetadata.Stats.GlossaryGuidMapCount = $linkMetadata.GuidToGlossarySlug.Count + $notes.Add("Link metadata captured: guidToSection=$($linkMetadata.Stats.GuidSectionMapCount), tocAlias=$($linkMetadata.Stats.TocAliasCount), guidToGlossarySlug=$($linkMetadata.Stats.GlossaryGuidMapCount), internalLinks=$($linkMetadata.Stats.InternalHyperlinkCount).") + $markdown | Set-Content -LiteralPath $OutputPath -Encoding UTF8 } finally { @@ -165,11 +244,21 @@ function ConvertFrom-OpenSpecDocxWithOpenXml { } } + $linkMetadataOut = [ordered]@{ + GuidToSection = $linkMetadata.GuidToSection + SectionToTitle = $linkMetadata.SectionToTitle + TocAlias = $linkMetadata.TocAlias + GuidToGlossarySlug = $linkMetadata.GuidToGlossarySlug + InternalHyperlinks = @($linkMetadata.InternalHyperlinks.ToArray()) + Stats = $linkMetadata.Stats + } + return [pscustomobject]@{ PSTypeName = 'AwakeCoding.OpenSpecs.ConversionStep' Strategy = 'openxml-docx' OutputPath = $OutputPath Notes = $notes.ToArray() + LinkMetadata = $linkMetadataOut } } @@ -679,7 +768,7 @@ function ConvertFrom-OpenSpecOpenXmlRunText { return ($parts.ToArray() -join '') } -function Get-OpenSpecOpenXmlParagraphAnchors { +function Get-OpenSpecOpenXmlParagraphAnchorInfo { [CmdletBinding()] param( [Parameter(Mandatory)] @@ -696,6 +785,8 @@ function Get-OpenSpecOpenXmlParagraphAnchors { ) $anchors = New-Object System.Collections.Generic.List[string] + $bookmarkNames = New-Object System.Collections.Generic.List[string] + $sectionAnchor = $null $bookmarkNodes = $ParagraphNode.SelectNodes('.//w:bookmarkStart', $NamespaceManager) foreach ($bookmarkNode in $bookmarkNodes) { @@ -709,6 +800,7 @@ function Get-OpenSpecOpenXmlParagraphAnchors { } 
$anchors.Add($bookmarkName) + $bookmarkNames.Add($bookmarkName) } if ($HeadingStyle -match '^Heading[1-6]$') { @@ -718,7 +810,82 @@ function Get-OpenSpecOpenXmlParagraphAnchors { } } - return @($anchors.ToArray() | Select-Object -Unique) + [pscustomobject]@{ + Anchors = @($anchors.ToArray() | Select-Object -Unique) + BookmarkNames = @($bookmarkNames.ToArray() | Select-Object -Unique) + SectionAnchor = $sectionAnchor + } +} + +function Get-OpenSpecOpenXmlParagraphAnchors { + [CmdletBinding()] + param( + [Parameter(Mandatory)] + [System.Xml.XmlNode]$ParagraphNode, + + [Parameter(Mandatory)] + [System.Xml.XmlNamespaceManager]$NamespaceManager, + + [Parameter()] + [string]$ParagraphText, + + [Parameter()] + [string]$HeadingStyle + ) + + $info = Get-OpenSpecOpenXmlParagraphAnchorInfo -ParagraphNode $ParagraphNode -NamespaceManager $NamespaceManager -ParagraphText $ParagraphText -HeadingStyle $HeadingStyle + return @($info.Anchors) +} + +function Get-OpenSpecOpenXmlParagraphInternalHyperlinks { + [CmdletBinding()] + param( + [Parameter(Mandatory)] + [System.Xml.XmlNode]$ParagraphNode, + + [Parameter(Mandatory)] + [System.Xml.XmlNamespaceManager]$NamespaceManager + ) + + $links = New-Object System.Collections.Generic.List[object] + $hyperlinkNodes = $ParagraphNode.SelectNodes('.//w:hyperlink[@w:anchor]', $NamespaceManager) + foreach ($hyperlinkNode in $hyperlinkNodes) { + $anchor = $hyperlinkNode.GetAttribute('anchor', 'http://schemas.openxmlformats.org/wordprocessingml/2006/main') + if ([string]::IsNullOrWhiteSpace($anchor)) { + continue + } + + $textNodes = $hyperlinkNode.SelectNodes('.//w:t', $NamespaceManager) + $parts = New-Object System.Collections.Generic.List[string] + foreach ($textNode in $textNodes) { + if (-not [string]::IsNullOrWhiteSpace($textNode.InnerText)) { + [void]$parts.Add($textNode.InnerText) + } + } + $text = (($parts.ToArray() -join '') -replace '\s+', ' ').Trim() + + [void]$links.Add([pscustomobject]@{ + Anchor = $anchor + Text = $text + }) + } + 
+ return @($links.ToArray()) +} + +function Get-OpenSpecGlossarySlugFromTerm { + [CmdletBinding()] + param( + [Parameter(Mandatory)] + [string]$Term + ) + + $slug = $Term -replace '\s+', '-' -replace '[^\w\-]', '' -replace '-+', '-' -replace '^-|-$', '' + $slug = $slug.ToLowerInvariant() + if ([string]::IsNullOrWhiteSpace($slug)) { + $slug = 'term' + } + return "gt_$slug" } function Get-OpenSpecSectionAnchorFromHeadingText { diff --git a/AwakeCoding.OpenSpecs/Private/Get-OpenSpecGuidSectionMapFromLearn.ps1 b/AwakeCoding.OpenSpecs/Private/Get-OpenSpecGuidSectionMapFromLearn.ps1 new file mode 100644 index 00000000..6ef500c7 --- /dev/null +++ b/AwakeCoding.OpenSpecs/Private/Get-OpenSpecGuidSectionMapFromLearn.ps1 @@ -0,0 +1,69 @@ +<# +.SYNOPSIS + Builds a GUID-to-section map by fetching section pages from Microsoft Learn. +.DESCRIPTION + For Open Specs that have GuidToSection=0 from DOCX conversion (e.g. MS-RDPBCGR), + fetches each section page from Learn (openspecs/windows_protocols/protocolId/{guid-with-hyphens}), + parses the H1 for the section number (e.g. "2.2.1.4 Server MCS Connect Response PDU..."), + and returns a hashtable: guid_no_hyphens -> Section_N.N. +.PARAMETER ProtocolId + Protocol ID (e.g. MS-RDPBCGR). +.PARAMETER Guids + Array of 32-character hex GUIDs (no hyphens) to resolve. +.PARAMETER ThrottleSeconds + Delay between HTTP requests to avoid overloading Learn. Default 1. 
+.OUTPUTS + Hashtable: lowercase guid (no hyphens) -> Section_N.N +#> +function Get-OpenSpecGuidSectionMapFromLearn { + [CmdletBinding()] + param( + [Parameter(Mandatory)] + [string]$ProtocolId, + + [Parameter(Mandatory)] + [string[]]$Guids, + + [Parameter()] + [int]$ThrottleSeconds = 1 + ) + + $ErrorActionPreference = 'Stop' + + function ConvertTo-HyphenatedGuid { + param([string]$Hex32) + if ($Hex32.Length -ne 32) { return $null } + $Hex32.Substring(0, 8) + '-' + $Hex32.Substring(8, 4) + '-' + $Hex32.Substring(12, 4) + '-' + $Hex32.Substring(16, 4) + '-' + $Hex32.Substring(20, 12) + } + + $baseUrl = "https://learn.microsoft.com/en-us/openspecs/windows_protocols/$($ProtocolId.ToLowerInvariant())" + $map = @{} + $uniqueGuids = @($Guids | ForEach-Object { $_.ToLowerInvariant() } | Select-Object -Unique) + $total = $uniqueGuids.Count + $resolved = 0 + + foreach ($i in 0..($uniqueGuids.Count - 1)) { + $guidHex = $uniqueGuids[$i] + $guidHyphenated = ConvertTo-HyphenatedGuid -Hex32 $guidHex + if (-not $guidHyphenated) { continue } + $url = "$baseUrl/$guidHyphenated" + try { + $response = Invoke-WebRequest -Uri $url -UseBasicParsing -TimeoutSec 15 -ErrorAction Stop + $html = $response.Content + if ($html -match ']*>\s*(\d+(?:\.\d+)*)\s+' -or $html -match '(?:^|\n)#\s+(\d+(?:\.\d+)*)\s+') { + $sectionNum = $Matches[1] + $map[$guidHex] = "Section_$sectionNum" + $resolved++ + } + } + catch { + Write-Verbose "Failed to fetch $url : $_" + } + if ($ThrottleSeconds -gt 0 -and $i -lt $uniqueGuids.Count - 1) { + Start-Sleep -Seconds $ThrottleSeconds + } + } + + Write-Verbose "Resolved $resolved / $total GUIDs from Learn" + $map +} diff --git a/AwakeCoding.OpenSpecs/Private/Invoke-OpenSpecMarkdownCleanup.ps1 b/AwakeCoding.OpenSpecs/Private/Invoke-OpenSpecMarkdownCleanup.ps1 index 8617fea9..40d56686 100644 --- a/AwakeCoding.OpenSpecs/Private/Invoke-OpenSpecMarkdownCleanup.ps1 +++ b/AwakeCoding.OpenSpecs/Private/Invoke-OpenSpecMarkdownCleanup.ps1 @@ -5,12 +5,49 @@ function 
Invoke-OpenSpecMarkdownCleanup { [string]$Markdown, [Parameter(Mandatory)] - [string]$CurrentProtocolId + [string]$CurrentProtocolId, + + [Parameter()] + [object]$SourceLinkMetadata, + + [switch]$RemoveDocumentIndex = $true ) $issues = New-Object System.Collections.Generic.List[object] $result = $Markdown + if ($RemoveDocumentIndex) { + $indexResult = Remove-OpenSpecDocumentIndex -Markdown $result + $result = $indexResult.Markdown + if ($indexResult.Removed) { + [void]$issues.Add([pscustomobject]@{ + Type = 'DocumentIndexRemoved' + Severity = 'Info' + Reason = 'Back-of-document index section was removed (page numbers are not meaningful in Markdown).' + }) + } + } + + $titleResult = Set-OpenSpecDocumentTitle -Markdown $result -CurrentProtocolId $CurrentProtocolId + $result = $titleResult.Markdown + if ($titleResult.Normalized) { + [void]$issues.Add([pscustomobject]@{ + Type = 'DocumentTitleNormalized' + Severity = 'Info' + Reason = 'Document title was normalized to a single H1 heading.' + }) + } + + $frontMatterResult = Remove-OpenSpecFrontMatterBoilerplate -Markdown $result + $result = $frontMatterResult.Markdown + if ($frontMatterResult.Removed) { + [void]$issues.Add([pscustomobject]@{ + Type = 'FrontMatterBoilerplateRemoved' + Severity = 'Info' + Reason = 'IP notice, revision history, and support boilerplate were removed after the title; last updated date retained when present.' 
+ }) + } + $tableResult = ConvertFrom-OpenSpecHtmlTables -Markdown $result $result = $tableResult.Markdown foreach ($issue in $tableResult.Issues) { [void]$issues.Add($issue) } @@ -31,16 +68,84 @@ function Invoke-OpenSpecMarkdownCleanup { $result = $tocResult.Markdown foreach ($issue in $tocResult.Issues) { [void]$issues.Add($issue) } - $guidResult = Resolve-OpenSpecGuidSectionAnchors -Markdown $result + $sourceGuidToSection = if ($SourceLinkMetadata -and $SourceLinkMetadata.PSObject.Properties['GuidToSection']) { $SourceLinkMetadata.GuidToSection } else { $null } + $guidResult = Resolve-OpenSpecGuidSectionAnchors -Markdown $result -GuidToSectionMap $sourceGuidToSection $result = $guidResult.Markdown foreach ($issue in $guidResult.Issues) { [void]$issues.Add($issue) } + $crossSpecResult = Repair-OpenSpecCrossSpecLinks -Markdown $result -CurrentProtocolId $CurrentProtocolId + $result = $crossSpecResult.Markdown + foreach ($issue in $crossSpecResult.Issues) { [void]$issues.Add($issue) } + + $sectionNumResult = Repair-OpenSpecSectionNumberLinks -Markdown $result + $result = $sectionNumResult.Markdown + foreach ($issue in $sectionNumResult.Issues) { [void]$issues.Add($issue) } + $mathResult = ConvertTo-OpenSpecNormalizedMathText -Markdown $result $result = $mathResult.Markdown foreach ($issue in $mathResult.Issues) { [void]$issues.Add($issue) } $result = Convert-OpenSpecInlineHtmlToMarkdown -Text $result $result = Remove-OpenSpecStandaloneTableTagLines -Text $result + + $anchorResult = Add-OpenSpecSectionAnchors -Markdown $result + $result = $anchorResult.Markdown + if ($anchorResult.InjectedCount -gt 0) { + [void]$issues.Add([pscustomobject]@{ + Type = 'SectionAnchorsInjected' + Severity = 'Info' + Count = $anchorResult.InjectedCount + Reason = 'Section anchor tags were added so TOC and in-document links resolve correctly.' 
+ }) + } + + $tocAnchorResult = Add-OpenSpecMissingSectionAnchorsFromToc -Markdown $result + $result = $tocAnchorResult.Markdown + if ($tocAnchorResult.InjectedCount -gt 0) { + [void]$issues.Add([pscustomobject]@{ + Type = 'MissingSectionAnchorsFromToc' + Severity = 'Info' + Count = $tocAnchorResult.InjectedCount + Reason = 'Missing section anchors were injected using TOC titles so linked section numbers resolve.' + }) + } + + $sourceSectionToTitle = if ($SourceLinkMetadata -and $SourceLinkMetadata.PSObject.Properties['SectionToTitle']) { $SourceLinkMetadata.SectionToTitle } else { $null } + $guidByHeadingResult = Repair-OpenSpecSectionGuidLinksByHeadingMatch -Markdown $result -SectionToTitleMap $sourceSectionToTitle + $result = $guidByHeadingResult.Markdown + if ($guidByHeadingResult.LinksRepaired -gt 0) { + [void]$issues.Add([pscustomobject]@{ + Type = 'SectionGuidLinksRepairedByHeading' + Severity = 'Info' + Count = $guidByHeadingResult.LinksRepaired + Reason = 'Section GUID links were rewritten to section numbers by matching link text to headings.' + }) + } + + $sourceGuidToGlossarySlug = if ($SourceLinkMetadata -and $SourceLinkMetadata.PSObject.Properties['GuidToGlossarySlug']) { $SourceLinkMetadata.GuidToGlossarySlug } else { $null } + $glossaryResult = Add-OpenSpecGlossaryAnchorsAndRepairLinks -Markdown $result -GuidToGlossarySlugMap $sourceGuidToGlossarySlug + $result = $glossaryResult.Markdown + if ($glossaryResult.AnchorsInjected -gt 0 -or $glossaryResult.LinksRepaired -gt 0) { + [void]$issues.Add([pscustomobject]@{ + Type = 'GlossaryAnchorsAndLinks' + Severity = 'Info' + AnchorsInjected = $glossaryResult.AnchorsInjected + LinksRepaired = $glossaryResult.LinksRepaired + SourceMapLinksRepaired = if ($glossaryResult.PSObject.Properties['SourceMapLinksRepaired']) { $glossaryResult.SourceMapLinksRepaired } else { 0 } + Reason = 'Glossary term anchors were added and #gt_ links were rewritten so they resolve.' 
+ }) + } + + $tocGitHubResult = ConvertTo-OpenSpecGitHubFriendlyToc -Markdown $result + $result = $tocGitHubResult.Markdown + if ($tocGitHubResult.Rewritten) { + [void]$issues.Add([pscustomobject]@{ + Type = 'TocGitHubFriendly' + Severity = 'Info' + Reason = 'Table of contents was rewritten as collapsible sections for better GitHub rendering.' + }) + } + $newLine = [Environment]::NewLine $result = [regex]::Replace($result, "(`r?`n){3,}", "$newLine$newLine") @@ -739,6 +844,97 @@ function ConvertTo-OpenSpecNormalizedTocLinks { } } +function ConvertTo-OpenSpecGitHubFriendlyToc { + [CmdletBinding()] + param( + [Parameter(Mandatory)] + [string]$Markdown + ) + + $newLine = [Environment]::NewLine + $tocLineRegex = [regex]::new('^\s*\[(?\d+(?:\.\d+)*)\s+(?[^\]]*)\]\(#Section_(?<sec>\d+(?:\.\d+)*)\)\s*$') + $lines = $Markdown -split '\r?\n' + $tocTitleIndex = -1 + for ($i = 0; $i -lt $lines.Count; $i++) { + if ($lines[$i].Trim() -ceq 'Table of Contents') { + $tocTitleIndex = $i + break + } + } + if ($tocTitleIndex -lt 0) { + return [pscustomobject]@{ Markdown = $Markdown; Rewritten = $false } + } + $tocEndIndex = -1 + for ($i = $tocTitleIndex + 1; $i -lt $lines.Count; $i++) { + if ($tocLineRegex.IsMatch($lines[$i])) { + $tocEndIndex = $i + } elseif ($lines[$i].Trim() -ne '' -and $tocEndIndex -ge 0) { + break + } + } + if ($tocEndIndex -lt $tocTitleIndex) { + return [pscustomobject]@{ Markdown = $Markdown; Rewritten = $false } + } + + $entries = [System.Collections.Generic.List[object]]::new() + for ($i = $tocTitleIndex + 1; $i -le $tocEndIndex; $i++) { + $line = $lines[$i] + $m = $tocLineRegex.Match($line) + if ($m.Success) { + [void]$entries.Add([pscustomobject]@{ + SectionNum = $m.Groups['num'].Value + Title = $m.Groups['title'].Value.Trim() + FullLink = $line.Trim() + }) + } + } + if ($entries.Count -eq 0) { + return [pscustomobject]@{ Markdown = $Markdown; Rewritten = $false } + } + + $topLevelToTitle = @{} + foreach ($e in $entries) { + $first = $e.SectionNum -replace 
'\..*$', '' + if (-not $topLevelToTitle.ContainsKey($first)) { + $topLevelToTitle[$first] = $e.Title + } + } + $groups = @{} + foreach ($e in $entries) { + $first = $e.SectionNum -replace '\..*$', '' + if (-not $groups.ContainsKey($first)) { + $groups[$first] = [System.Collections.Generic.List[object]]::new() + } + [void]$groups[$first].Add($e) + } + $sb = [System.Text.StringBuilder]::new() + [void]$sb.AppendLine('Table of Contents') + [void]$sb.AppendLine() + $firstKeys = $groups.Keys | Sort-Object { [int]$_ } + foreach ($key in $firstKeys) { + $title = $topLevelToTitle[$key] + [void]$sb.AppendLine('<details>') + [void]$sb.AppendLine("<summary>$key $title</summary>") + [void]$sb.AppendLine() + foreach ($e in $groups[$key]) { + $indent = ' ' * (($e.SectionNum -split '\.').Count - 1) + [void]$sb.AppendLine("$indent- $($e.FullLink)") + } + [void]$sb.AppendLine('</details>') + [void]$sb.AppendLine() + } + $newToc = $sb.ToString().TrimEnd($newLine.ToCharArray()) + $before = ($lines[0..($tocTitleIndex - 1)] -join $newLine).TrimEnd() + $afterStart = $tocEndIndex + 1 + $after = if ($afterStart -lt $lines.Count) { $newLine + ($lines[$afterStart..($lines.Count - 1)] -join $newLine) } else { '' } + $result = $before + $newLine + $newLine + $newToc + $after + + [pscustomobject]@{ + Markdown = $result + Rewritten = $true + } +} + function ConvertTo-OpenSpecNormalizedEncodedBracketUrls { [CmdletBinding()] param( @@ -825,7 +1021,10 @@ function Resolve-OpenSpecGuidSectionAnchors { [CmdletBinding()] param( [Parameter(Mandatory)] - [string]$Markdown + [string]$Markdown, + + [Parameter()] + [object]$GuidToSectionMap ) $issues = New-Object System.Collections.Generic.List[object] @@ -843,6 +1042,20 @@ function Resolve-OpenSpecGuidSectionAnchors { # lowercase "section_" while the hyperlink uses "Section_"). Replacing # these with the Section_X.Y.Z form fixes both issues. 
$guidToSection = @{} + $sourceMapCount = 0 + if ($GuidToSectionMap) { + foreach ($entry in $GuidToSectionMap.GetEnumerator()) { + $guid = ([string]$entry.Key).ToLowerInvariant() + $section = [string]$entry.Value + if ([string]::IsNullOrWhiteSpace($guid) -or [string]::IsNullOrWhiteSpace($section)) { + continue + } + if (-not $guidToSection.ContainsKey($guid)) { + $guidToSection[$guid] = $section + $sourceMapCount++ + } + } + } # Order 1: GUID anchor followed by Section anchor (most common) $pairRegex1 = [regex]::new( @@ -901,6 +1114,7 @@ function Resolve-OpenSpecGuidSectionAnchors { Severity = 'Info' Count = $rewriteCount MappedAnchors = $guidToSection.Count + SourceMappedAnchors = $sourceMapCount Reason = 'GUID-based section anchors were resolved to section number anchors.' }) } @@ -911,6 +1125,105 @@ function Resolve-OpenSpecGuidSectionAnchors { } } +function Repair-OpenSpecCrossSpecLinks { + [CmdletBinding()] + param( + [Parameter(Mandatory)] + [string]$Markdown, + + [Parameter(Mandatory)] + [string]$CurrentProtocolId + ) + + $issues = New-Object System.Collections.Generic.List[object] + $result = $Markdown + $rewriteCount = 0 + + # Links like ](#Section_<32hex>) are Word bookmark IDs. When they point to another + # spec (cross-reference), the GUID is not in the current document, so they break. + # Rewrite them to ](../ProtocolId/ProtocolId.md) using [MS-XXX] from the link text + # or from the same line (e.g. References: "[MS-RDPBCGR] ... \"[Title](#Section_guid)\""). 
+ $pattern = '\[([^\]]+)\]\(#Section_([a-f0-9]{32})\)' + $matches = [regex]::Matches($result, $pattern) + $currentIdUpper = $CurrentProtocolId.ToUpperInvariant() + + foreach ($m in ($matches | Sort-Object -Property { $_.Index } -Descending)) { + $linkText = $m.Groups[1].Value + $nlIdx = $result.LastIndexOf("`n", [Math]::Min($m.Index, $result.Length - 1)) + $lineStart = if ($nlIdx -ge 0) { $nlIdx + 1 } else { 0 } + $lineEndIdx = $result.IndexOf("`n", $m.Index) + $lineEnd = if ($lineEndIdx -ge 0) { $lineEndIdx } else { $result.Length } + $line = $result.Substring($lineStart, $lineEnd - $lineStart) + + $protocolId = $null + if ($linkText -match '^(MS|MC)-[A-Z0-9\-]+$') { + $protocolId = $linkText + } + elseif ($line -match '\[(MS-[A-Z0-9\-]+|MC-[A-Z0-9\-]+)\]') { + $protocolId = $Matches[1] + } + + if ($protocolId -and $protocolId.ToUpperInvariant() -ne $currentIdUpper) { + $replacement = "[$linkText](../$protocolId/$protocolId.md)" + $result = $result.Substring(0, $m.Index) + $replacement + $result.Substring($m.Index + $m.Length) + $rewriteCount++ + } + } + + if ($rewriteCount -gt 0) { + [void]$issues.Add([pscustomobject]@{ + Type = 'CrossSpecLinksRepaired' + Severity = 'Info' + Count = $rewriteCount + Reason = 'Cross-spec references (GUID anchors) were rewritten to relative spec paths.' + }) + } + + [pscustomobject]@{ + Markdown = $result + Issues = $issues.ToArray() + } +} + +function Repair-OpenSpecSectionNumberLinks { + [CmdletBinding()] + param( + [Parameter(Mandatory)] + [string]$Markdown + ) + + $issues = New-Object System.Collections.Generic.List[object] + $result = $Markdown + + # In-document links like [5.3.8](#Section_guid) often have no guid->section mapping + # (Word bookmark pair missing in converted output). When the link text is a section + # number, rewrite to [5.3.8](#Section_5.3.8) so they resolve to our injected anchors. 
+ $pattern = [regex]::new( + '\[(?<num>\d+(?:\.\d+)*)\]\(#Section_[a-f0-9]{32}\)', + [System.Text.RegularExpressions.RegexOptions]::IgnoreCase + ) + $rewriteCount = $pattern.Matches($result).Count + $result = $pattern.Replace($result, { + param($m) + $num = $m.Groups['num'].Value + "[$num](#Section_$num)" + }) + + if ($rewriteCount -gt 0) { + [void]$issues.Add([pscustomobject]@{ + Type = 'SectionNumberLinksRepaired' + Severity = 'Info' + Count = $rewriteCount + Reason = 'In-document section links (GUID anchors) were rewritten to section number anchors.' + }) + } + + [pscustomobject]@{ + Markdown = $result + Issues = $issues.ToArray() + } +} + function Resolve-OpenSpecLinkTarget { [CmdletBinding()] param( @@ -998,3 +1311,499 @@ function Remove-OpenSpecStandaloneTableTagLines { return $result } + +function Remove-OpenSpecDocumentIndex { + [CmdletBinding()] + param( + [Parameter(Mandatory)] + [string]$Markdown + ) + + $result = $Markdown + $removed = $false + + # Match the back-of-document index section: heading "# N Index" (e.g. "# 8 Index", "# 9 Index"). + # Do not match "Index of Security Parameters" or other subsections. + $indexHeadingRegex = [regex]::new('(?m)^# \d+ Index\s*$') + $match = $indexHeadingRegex.Match($result) + if ($match.Success) { + $result = $result.Substring(0, $match.Index).TrimEnd() + $removed = $true + + # Remove any trailing anchor line(s) that only served the index heading (optional). + $trailingAnchorRegex = [regex]::new('(?ms)(\r?\n)(<a\s+id="[^"]+"></a>\s*)+$') + $result = $trailingAnchorRegex.Replace($result, '') + + # Remove the "N Index" TOC entry so we don't leave a dead link. 
+ $result = [regex]::Replace($result, '(?m)^\s*\[\d+ Index\]\(#Section_\d+\)\s*\r?\n', '') + } + + [pscustomobject]@{ + Markdown = $result + Removed = $removed + } +} + +function Set-OpenSpecDocumentTitle { + [CmdletBinding()] + param( + [Parameter(Mandatory)] + [string]$Markdown, + + [Parameter(Mandatory)] + [string]$CurrentProtocolId + ) + + $result = $Markdown + $normalized = $false + + # Replace leading "**[MS-XXX]:**\n\n**Full Title**" with a single "# [MS-XXX]: Full Title" H1. + $escapedId = [regex]::Escape($CurrentProtocolId) + $titlePattern = [regex]::new( + '^\s*\*\*(?:\[' + $escapedId + '\]|' + $escapedId + ')\s*:\s*\*\*\s*\r?\n\r?\n\*\*(?<title>[^*]+)\*\*', + [System.Text.RegularExpressions.RegexOptions]::Multiline + ) + $match = $titlePattern.Match($result) + if ($match.Success) { + $title = $match.Groups['title'].Value.Trim() + $replacement = "# [$CurrentProtocolId]: $title" + $result = $result.Substring(0, $match.Index) + $replacement + $result.Substring($match.Index + $match.Length) + $normalized = $true + } + + [pscustomobject]@{ + Markdown = $result + Normalized = $normalized + } +} + +function Remove-OpenSpecFrontMatterBoilerplate { + [CmdletBinding()] + param( + [Parameter(Mandatory)] + [string]$Markdown + ) + + $result = $Markdown + $removed = $false + $newLine = [Environment]::NewLine + + # Block from "Intellectual Property Rights Notice" (or similar) through the revision table, ending before "Table of Contents". 
+ $blockRegex = [regex]::new( + '(?s)(\r?\n)(Intellectual Property Rights Notice.*?)(\r?\n\r?\n)(Table of Contents)', + [System.Text.RegularExpressions.RegexOptions]::IgnoreCase + ) + $match = $blockRegex.Match($result) + if ($match.Success) { + $blockContent = $match.Groups[2].Value + $lastUpdated = $null + $dateRowRegex = [regex]::new('\|\s*(\d{1,2}/\d{1,2}/\d{4})\s*\|') + $dateMatches = $dateRowRegex.Matches($blockContent) + if ($dateMatches.Count -gt 0) { + $lastMatch = $dateMatches[$dateMatches.Count - 1] + $lastUpdated = $lastMatch.Groups[1].Value + } + $replacement = $match.Groups[1].Value + if ($lastUpdated) { + $replacement += "Last updated: $lastUpdated" + $newLine + $newLine + } else { + $replacement += $match.Groups[3].Value + } + $replacement += $match.Groups[4].Value + $result = $result.Substring(0, $match.Index) + $replacement + $result.Substring($match.Index + $match.Length) + $removed = $true + } + + [pscustomobject]@{ + Markdown = $result + Removed = $removed + } +} + +function Add-OpenSpecSectionAnchors { + [CmdletBinding()] + param( + [Parameter(Mandatory)] + [string]$Markdown + ) + + $newLine = [Environment]::NewLine + $lines = [System.Collections.Generic.List[string]]::new() + $injectedCount = 0 + + # Heading pattern: optional leading whitespace, 1-6 hashes, space, section number (e.g. 1, 1.1, 2.2.2.2.1.1.1), space, rest. 
+ $headingRegex = [regex]::new('^\s*(#{1,6})\s+(\d+(?:\.\d+)*)\s+(.+)$') + + $i = 0 + $lineArray = $Markdown -split '\r?\n' + while ($i -lt $lineArray.Count) { + $line = $lineArray[$i] + $headingMatch = $headingRegex.Match($line) + if ($headingMatch.Success) { + $sectionNum = $headingMatch.Groups[2].Value + $anchorId = "Section_$sectionNum" + $anchorLine = "<a id=`"$anchorId`"></a>" + $prevLine = if ($lines.Count -gt 0) { $lines[$lines.Count - 1].Trim() } else { '' } + if ($prevLine -ne $anchorLine) { + [void]$lines.Add($anchorLine) + $injectedCount++ + } + } + [void]$lines.Add($line) + $i++ + } + + [pscustomobject]@{ + Markdown = $lines -join $newLine + InjectedCount = $injectedCount + } +} + +function Add-OpenSpecMissingSectionAnchorsFromToc { + [CmdletBinding()] + param( + [Parameter(Mandatory)] + [string]$Markdown + ) + + $newLine = [Environment]::NewLine + $lines = [System.Collections.Generic.List[string]]::new($Markdown -split '\r?\n') + $injectedCount = 0 + + # Collect (sectionNum, title) from TOC lines: [N.N Title](#Section_N.N) + $tocEntryRegex = [regex]::new('^\s*\[(?<num>\d+(?:\.\d+)*)\s+(?<title>[^\]]*)\]\(#Section_(?<sec>\d+(?:\.\d+)*)\)\s*$') + $tocEntries = [System.Collections.Generic.List[object]]::new() + foreach ($line in $lines) { + $m = $tocEntryRegex.Match($line) + if ($m.Success -and $m.Groups['num'].Value -eq $m.Groups['sec'].Value) { + [void]$tocEntries.Add([pscustomobject]@{ SectionNum = $m.Groups['num'].Value; Title = $m.Groups['title'].Value.Trim() }) + } + } + + # Which Section_N.N anchors already exist? + $existingAnchors = [System.Collections.Generic.HashSet[string]]::new([StringComparer]::OrdinalIgnoreCase) + foreach ($line in $lines) { + if ($line -match '^\s*<a\s+id="(Section_\d+(?:\.\d+)*)"\s*></a>\s*$') { + [void]$existingAnchors.Add($Matches[1]) + } + } + + # Missing: (sectionNum, title) from TOC where anchor is missing. Keep TOC order. 
+ $missingList = [System.Collections.Generic.List[object]]::new() + $seen = [System.Collections.Generic.HashSet[string]]::new() + foreach ($e in $tocEntries) { + $id = "Section_$($e.SectionNum)" + if (-not $existingAnchors.Contains($id) -and -not $seen.Contains($id)) { + [void]$seen.Add($id) + [void]$missingList.Add([pscustomobject]@{ SectionNum = $e.SectionNum; Title = $e.Title }) + } + } + if ($missingList.Count -eq 0) { + return [pscustomobject]@{ Markdown = $Markdown; InjectedCount = 0 } + } + + # Assign each missing section to the first line (in doc order) that matches its title. + # Prefer heading lines that contain the title; else use a non-heading line that equals the title exactly. + $lineIndexToSection = @{} + $assignedLines = [System.Collections.Generic.HashSet[int]]::new() + foreach ($entry in $missingList) { + $title = $entry.Title + if ([string]::IsNullOrWhiteSpace($title)) { continue } + $found = $false + for ($i = 0; $i -lt $lines.Count -and -not $found; $i++) { + if ($assignedLines.Contains($i)) { continue } + $line = $lines[$i] + $lineTrim = $line.Trim() + $isHeading = $line -match '^\s*#{1,6}\s+(.+)$' + $content = if ($isHeading) { $Matches[1].Trim() } else { $lineTrim } + $matchesTitle = $content -like "*$title*" + $exactMatch = $content -ceq $title + if (-not $matchesTitle -and -not $exactMatch) { continue } + if (-not $isHeading -and -not $exactMatch) { continue } + $lineIndexToSection[$i] = $entry.SectionNum + [void]$assignedLines.Add($i) + $found = $true + } + } + + # Insert anchors in reverse line order so indices stay valid. 
+ foreach ($idx in ($lineIndexToSection.Keys | Sort-Object -Descending)) { + $sectionNum = $lineIndexToSection[$idx] + $anchorId = "Section_$sectionNum" + $lines.Insert($idx, "<a id=`"$anchorId`"></a>") + $injectedCount++ + } + + [pscustomobject]@{ + Markdown = $lines -join $newLine + InjectedCount = $injectedCount + } +} + +function Repair-OpenSpecSectionGuidLinksByHeadingMatch { + [CmdletBinding()] + param( + [Parameter(Mandatory)] + [string]$Markdown, + + [Parameter()] + [object]$SectionToTitleMap + ) + + $newLine = [Environment]::NewLine + $lineArray = $Markdown -split '\r?\n' + $titleToSection = @{} + $anchorIdRegex = [regex]::new('<a\s+id="([^"]+)"\s*></a>', 'IgnoreCase') + + # Collect all existing anchors + $existingAnchors = [System.Collections.Generic.HashSet[string]]::new([StringComparer]::OrdinalIgnoreCase) + foreach ($m in $anchorIdRegex.Matches($Markdown)) { + [void]$existingAnchors.Add($m.Groups[1].Value) + } + + if ($SectionToTitleMap) { + foreach ($entry in $SectionToTitleMap.GetEnumerator()) { + $sectionId = [string]$entry.Key + $title = [string]$entry.Value + if ([string]::IsNullOrWhiteSpace($sectionId) -or [string]::IsNullOrWhiteSpace($title)) { continue } + $norm = ($title -replace '\s+', ' ').Trim() + if (-not $titleToSection.ContainsKey($norm)) { $titleToSection[$norm] = $sectionId } + $withoutNum = $title -replace '^\d+(?:\.\d+)*\s+', '' + $normWithout = ($withoutNum -replace '\s+', ' ').Trim() + if ($normWithout -and -not $titleToSection.ContainsKey($normWithout)) { $titleToSection[$normWithout] = $sectionId } + } + } + + # From lines with existing Section_N.N anchors + following line (heading or plain title) + for ($i = 0; $i -lt $lineArray.Count; $i++) { + $line = $lineArray[$i] + if ($line -match '^\s*<a\s+id="(Section_\d+(?:\.\d+)*)"\s*></a>\s*$') { + $sectionId = $Matches[1] + $nextLine = if ($i + 1 -lt $lineArray.Count) { $lineArray[$i + 1].Trim() } else { '' } + if ([string]::IsNullOrWhiteSpace($nextLine)) { continue } + $title = if 
($nextLine -match '^\s*#{1,6}\s+(?<title>.+)$') { $Matches['title'].Trim() } else { $nextLine } + $norm = ($title -replace '\s+', ' ').Trim() + if (-not $titleToSection.ContainsKey($norm)) { $titleToSection[$norm] = $sectionId } + $withoutNum = $title -replace '^\d+(?:\.\d+)*\s+', '' + $normWithout = ($withoutNum -replace '\s+', ' ').Trim() + if ($normWithout -and -not $titleToSection.ContainsKey($normWithout)) { $titleToSection[$normWithout] = $sectionId } + $withoutParen = $title -replace '\s*\([^)]*\)\s*$', '' # "Share Control Header (TS_SHARECONTROLHEADER)" -> "Share Control Header" + $normNoParen = ($withoutParen -replace '\s+', ' ').Trim() + if ($normNoParen -and -not $titleToSection.ContainsKey($normNoParen)) { $titleToSection[$normNoParen] = $sectionId } + } + } + + # From ALL headings that start with section number (e.g. ## 2.2.8.1.1.1 Share Control Header) + $headingNumRegex = [regex]::new('^\s*#{1,6}\s+(\d+(?:\.\d+)*)\s+(?<title>.+)$') + for ($i = 0; $i -lt $lineArray.Count; $i++) { + $line = $lineArray[$i] + $hm = $headingNumRegex.Match($line) + if ($hm.Success) { + $sectionNum = $hm.Groups[1].Value + $sectionId = "Section_$sectionNum" + $title = $hm.Groups['title'].Value.Trim() + $norm = ($title -replace '\s+', ' ').Trim() + if (-not $titleToSection.ContainsKey($norm)) { $titleToSection[$norm] = $sectionId } + $withoutNum = ($title -replace '^\d+(?:\.\d+)*\s+', '') -replace '\s*\([^)]*\)\s*$', '' + $normWithout = ($withoutNum -replace '\s+', ' ').Trim() + if ($normWithout -and -not $titleToSection.ContainsKey($normWithout)) { $titleToSection[$normWithout] = $sectionId } + } + } + + # Find best section for link text: exact match, prefix match, or extract "(section N.N.N)" from link text. 
+ $findSectionForLinkText = { + param($norm, $titleToSection, $existingAnchors) + if ($titleToSection.ContainsKey($norm)) { return $titleToSection[$norm] } + # Extract section number from link text like "Share Control Header (section 2.2.8.1.1.1)" + if ($norm -match '\(section\s+(\d+(?:\.\d+)*)\)') { + $extractedId = "Section_$($Matches[1])" + if ($existingAnchors.Contains($extractedId)) { return $extractedId } + } + $candidates = @() + foreach ($key in $titleToSection.Keys) { + if ($key -eq $norm) { return $titleToSection[$key] } + if ($key.StartsWith($norm + ' ') -or $key.StartsWith($norm + '(')) { $candidates += [pscustomobject]@{ Key = $key; SectionId = $titleToSection[$key] } } + elseif ($norm.StartsWith($key + ' ') -or $norm.StartsWith($key + '(')) { $candidates += [pscustomobject]@{ Key = $key; SectionId = $titleToSection[$key] } } + elseif ($key.StartsWith($norm) -or $norm.StartsWith($key)) { $candidates += [pscustomobject]@{ Key = $key; SectionId = $titleToSection[$key] } } + } + if ($candidates.Count -eq 1) { return $candidates[0].SectionId } + if ($candidates.Count -gt 1) { + # Prefer shortest key (most specific match), e.g. 
"Status Info PDU" over "Status Info PDU Data (TS_...)" + $best = $candidates | Sort-Object -Property { $_.Key.Length } | Select-Object -First 1 + return $best.SectionId + } + return $null + } + $guidLinkRegex = [regex]::new('\[(?<text>[^\]]+)\]\(#Section_[a-fA-F0-9]{32}\)') + $result = $guidLinkRegex.Replace($Markdown, { + param($m) + $rawText = $m.Groups['text'].Value + $norm = ($rawText -replace '\*+', '' -replace '\s+', ' ').Trim() + $sectionId = & $findSectionForLinkText $norm $titleToSection $existingAnchors + if (-not $sectionId -and $rawText.Trim() -ne $norm) { $sectionId = & $findSectionForLinkText $rawText.Trim() $titleToSection $existingAnchors } + if ($sectionId) { "[$rawText](#$sectionId)" } else { $m.Value } + }) + $linksRepaired = 0 + foreach ($m in $guidLinkRegex.Matches($Markdown)) { + $norm = ($m.Groups['text'].Value -replace '\*+', '' -replace '\s+', ' ').Trim() + $sid = & $findSectionForLinkText $norm $titleToSection $existingAnchors + if (-not $sid) { $sid = & $findSectionForLinkText $m.Groups['text'].Value.Trim() $titleToSection $existingAnchors } + if ($sid) { $linksRepaired++ } + } + + [pscustomobject]@{ + Markdown = $result + LinksRepaired = $linksRepaired + } +} + +function Add-OpenSpecGlossaryAnchorsAndRepairLinks { + [CmdletBinding()] + param( + [Parameter(Mandatory)] + [string]$Markdown, + + [Parameter()] + [object]$GuidToGlossarySlugMap + ) + + $newLine = [Environment]::NewLine + $lineArray = [System.Collections.Generic.List[string]]::new($Markdown -split '\r?\n') + $termToSlug = @{} + $insertedSlugs = @{} + $injectedCount = 0 + + # Find the Glossary section: heading like "1.1 Glossary" or "## 1.1 Glossary". 
+ $glossaryHeadingRegex = [regex]::new('^\s*(#{1,6})\s+(?<num>\d+(?:\.\d+)*)\s+Glossary\s*$') + $anyHeadingRegex = [regex]::new('^\s*(#+)\s+.+$') + $glossaryDefRegex = [regex]::new('^\s*\*\*(?<term>[^*]+)\*\*\s*:\s*') + + $i = 0 + $inGlossary = $false + $glossaryLevel = 0 + while ($i -lt $lineArray.Count) { + $line = $lineArray[$i] + $headMatch = $glossaryHeadingRegex.Match($line) + if ($headMatch.Success) { + $inGlossary = $true + $glossaryLevel = $headMatch.Groups[1].Value.Length + $i++ + continue + } + if ($inGlossary) { + $headOnly = $anyHeadingRegex.Match($line) + if ($headOnly.Success -and $headOnly.Groups[1].Value.Length -le $glossaryLevel) { + $inGlossary = $false + } + } + if ($inGlossary) { + $defMatch = $glossaryDefRegex.Match($line) + if ($defMatch.Success) { + $term = $defMatch.Groups['term'].Value.Trim() + $slug = $term -replace '\s+', '-' -replace '[^\w\-]', '' -replace '-+', '-' -replace '^-|-$', '' + $slug = $slug.ToLowerInvariant() + if ([string]::IsNullOrWhiteSpace($slug)) { $slug = "term-$i" } + $slug = "gt_$slug" + $prevLine = if ($i -gt 0) { $lineArray[$i - 1].Trim() } else { '' } + $alreadyHasAnchor = $prevLine -match ('^\s*<a\s+id="' + [regex]::Escape($slug) + '"\s*></a>\s*$') + if (-not $insertedSlugs.ContainsKey($slug) -and -not $alreadyHasAnchor) { + $insertedSlugs[$slug] = $true + $anchorLine = "<a id=`"$slug`"></a>" + $lineArray.Insert($i, $anchorLine) + $injectedCount++ + $i++ + } + $normalizedTerm = $term.Trim() + $termToSlug[$normalizedTerm] = $slug + if ($term -match '^(.+?)\s+\(([^)]+)\)\s*$') { + $abbrev = $Matches[2].Trim() + $termBeforeParen = $Matches[1].Trim() + $termToSlug[$abbrev] = $slug + $termToSlug[$termBeforeParen] = $slug + if ($abbrev.Length -gt 0 -and -not $abbrev.EndsWith('s')) { + $termToSlug["$abbrev`s"] = $slug + } + # Plural phrasing used in body links: "Message Authentication Codes (MAC)", "input method editors (IMEs)", "Multipoint Communication Services (MCS)". 
+ if (-not $termBeforeParen.EndsWith('s')) { + $termToSlug["$termBeforeParen`s ($abbrev)"] = $slug + $abbrevPlural = if ($abbrev.EndsWith('s')) { $abbrev } else { "$abbrev`s" } + $termToSlug["$termBeforeParen`s ($abbrevPlural)"] = $slug + } + } + if ($normalizedTerm.EndsWith('s') -eq $false -and $normalizedTerm.Length -gt 1) { + $termToSlug["$normalizedTerm`s"] = $slug + } + } + } + $i++ + } + + $result = $lineArray -join $newLine + + # Rewrite [text](#gt_guid) to [text](#gt_slug) using source map first (deterministic), then link text -> slug map. + $linkRegex = [regex]::new('\[(?<text>[^\]]+)\]\(#gt_(?<guid>[a-f0-9\-]{36})\)') + $linksRepaired = 0 + $sourceGuidToSlug = @{} + if ($GuidToGlossarySlugMap) { + foreach ($entry in $GuidToGlossarySlugMap.GetEnumerator()) { + $guid = ([string]$entry.Key).ToLowerInvariant() + $slug = [string]$entry.Value + if ([string]::IsNullOrWhiteSpace($guid) -or [string]::IsNullOrWhiteSpace($slug)) { continue } + if (-not $sourceGuidToSlug.ContainsKey($guid)) { $sourceGuidToSlug[$guid] = $slug } + } + } + $matchesBeforeRewrite = $linkRegex.Matches($result) + # Case-insensitive fallback: build lower-key map so link text "RSA" / "rsa" resolve when abbrev is "RSA". 
+ $slugByLower = @{} + foreach ($k in $termToSlug.Keys) { + $lower = $k.ToLowerInvariant() + if (-not $slugByLower.ContainsKey($lower)) { $slugByLower[$lower] = $termToSlug[$k] } + } + $result = $linkRegex.Replace($result, { + param($m) + $rawText = $m.Groups['text'].Value + $normalized = ($rawText -replace '\*+', '').Trim() + $guid = $m.Groups['guid'].Value.ToLowerInvariant() + $slug = $null + if ($sourceGuidToSlug.ContainsKey($guid)) { + $slug = $sourceGuidToSlug[$guid] + } + elseif ($termToSlug.ContainsKey($normalized)) { + $slug = $termToSlug[$normalized] + } + elseif ($termToSlug.ContainsKey($rawText.Trim())) { + $slug = $termToSlug[$rawText.Trim()] + } + elseif ($slugByLower.ContainsKey($normalized.ToLowerInvariant())) { + $slug = $slugByLower[$normalized.ToLowerInvariant()] + } + if ($slug) { + "[$rawText](#$slug)" + } + else { + $m.Value + } + }) + + foreach ($match in $matchesBeforeRewrite) { + $guid = $match.Groups['guid'].Value.ToLowerInvariant() + $norm = ($match.Groups['text'].Value -replace '\*+', '').Trim() + if ($sourceGuidToSlug.ContainsKey($guid) -or $termToSlug.ContainsKey($norm) -or $termToSlug.ContainsKey($match.Groups['text'].Value.Trim()) -or $slugByLower.ContainsKey($norm.ToLowerInvariant())) { + $linksRepaired++ + } + } + $sourceMapLinksRepaired = 0 + foreach ($match in $matchesBeforeRewrite) { + $guid = $match.Groups['guid'].Value.ToLowerInvariant() + if ($sourceGuidToSlug.ContainsKey($guid)) { + $sourceMapLinksRepaired++ + } + } + + [pscustomobject]@{ + Markdown = $result + AnchorsInjected = $injectedCount + LinksRepaired = $linksRepaired + SourceMapLinksRepaired = $sourceMapLinksRepaired + } +} diff --git a/AwakeCoding.OpenSpecs/Public/Convert-OpenSpecToMarkdown.ps1 b/AwakeCoding.OpenSpecs/Public/Convert-OpenSpecToMarkdown.ps1 index 5be2c2e7..da8f2b73 100644 --- a/AwakeCoding.OpenSpecs/Public/Convert-OpenSpecToMarkdown.ps1 +++ b/AwakeCoding.OpenSpecs/Public/Convert-OpenSpecToMarkdown.ps1 @@ -15,7 +15,9 @@ function 
Convert-OpenSpecToMarkdown { [switch]$Parallel, - [int]$ThrottleLimit = 4 + [int]$ThrottleLimit = 4, + + [switch]$RemoveDocumentIndex = $true ) begin { @@ -49,9 +51,10 @@ function Convert-OpenSpecToMarkdown { $outputPathArg = $OutputPath $forceArg = $Force $sourceFormatArg = $SourceFormat + $removeIndexArg = $RemoveDocumentIndex $items | ForEach-Object -Parallel { Import-Module (Join-Path $using:moduleBase 'AwakeCoding.OpenSpecs.psd1') -Force | Out-Null - Convert-OpenSpecToMarkdown -Path $_.Path -OutputPath $using:outputPathArg -Force:$using:forceArg -SourceFormat $using:sourceFormatArg + Convert-OpenSpecToMarkdown -Path $_.Path -OutputPath $using:outputPathArg -Force:$using:forceArg -SourceFormat $using:sourceFormatArg -RemoveDocumentIndex:$using:removeIndexArg } -ThrottleLimit $ThrottleLimit return } @@ -141,7 +144,8 @@ function Convert-OpenSpecToMarkdown { $rawMarkdown = Get-Content -LiteralPath $conversionStep.OutputPath -Raw $normalized = ConvertTo-OpenSpecTextLayout -Markdown $rawMarkdown - $cleaned = Invoke-OpenSpecMarkdownCleanup -Markdown $normalized.Markdown -CurrentProtocolId $protocolId + $sourceLinkMetadata = if ($conversionStep.PSObject.Properties['LinkMetadata']) { $conversionStep.LinkMetadata } else { $null } + $cleaned = Invoke-OpenSpecMarkdownCleanup -Markdown $normalized.Markdown -CurrentProtocolId $protocolId -RemoveDocumentIndex:$RemoveDocumentIndex -SourceLinkMetadata $sourceLinkMetadata $allIssues = @() if ($normalized.Issues) { @@ -201,8 +205,14 @@ function Convert-OpenSpecToMarkdown { HasDocling = $toolchain.HasDocling HasMarkItDown = $toolchain.HasMarkItDown } + SourceLinkMetadataPath = if ($sourceLinkMetadata) { (Join-Path -Path $artifactDirectory -ChildPath 'source-link-metadata.json') } else { $null } } | ConvertTo-Json -Depth 8 | Set-Content -LiteralPath $sourceManifestPath -Encoding UTF8 + if ($sourceLinkMetadata) { + $sourceLinkMetadataPath = Join-Path -Path $artifactDirectory -ChildPath 'source-link-metadata.json' + 
$sourceLinkMetadata | ConvertTo-Json -Depth 10 | Set-Content -LiteralPath $sourceLinkMetadataPath -Encoding UTF8 + } + $reportPath = Join-Path -Path $artifactDirectory -ChildPath 'conversion-report.json' [pscustomobject]@{ ProtocolId = $protocolId diff --git a/AwakeCoding.OpenSpecs/Public/Get-OpenSpecCatalog.ps1 b/AwakeCoding.OpenSpecs/Public/Get-OpenSpecCatalog.ps1 index 1d0f49f8..56c23c75 100644 --- a/AwakeCoding.OpenSpecs/Public/Get-OpenSpecCatalog.ps1 +++ b/AwakeCoding.OpenSpecs/Public/Get-OpenSpecCatalog.ps1 @@ -1,7 +1,18 @@ +$script:OpenSpecReferenceDocsUri = 'https://learn.microsoft.com/en-us/openspecs/windows_protocols/ms-winprotlp/1593dc07-6116-4e9e-8aeb-85c7438fab0a' + +# Reference specs (MS-DTYP, MS-ERREF, MS-LCID, MS-UCODEREF) from https://learn.microsoft.com/en-us/openspecs/windows_protocols/ms-winprotlp/1593dc07-6116-4e9e-8aeb-85c7438fab0a +$script:OpenSpecReferenceSpecs = @( + [pscustomobject]@{ ProtocolId = 'MS-DTYP'; Title = 'Windows Data Types'; Slug = 'ms-dtyp'; SpecPageUrl = 'https://learn.microsoft.com/en-us/openspecs/windows_protocols/ms-dtyp/cca27429-5689-4a16-b2b4-9325d93e4ba2' } + [pscustomobject]@{ ProtocolId = 'MS-ERREF'; Title = 'Windows Error Codes'; Slug = 'ms-erref'; SpecPageUrl = 'https://learn.microsoft.com/en-us/openspecs/windows_protocols/ms-erref/1bc92ddf-b79e-413c-bbaa-99a5281a6c90' } + [pscustomobject]@{ ProtocolId = 'MS-LCID'; Title = 'Windows Language Code Identifier (LCID) Reference'; Slug = 'ms-lcid'; SpecPageUrl = 'https://learn.microsoft.com/en-us/openspecs/windows_protocols/ms-lcid/70feba9f-294e-491e-b6eb-56532684c37f' } + [pscustomobject]@{ ProtocolId = 'MS-UCODEREF'; Title = 'Windows Protocols Unicode Reference'; Slug = 'ms-ucoderef'; SpecPageUrl = 'https://learn.microsoft.com/en-us/openspecs/windows_protocols/ms-ucoderef/4a045e08-fc29-4f22-baf4-16f38c2825fb' } +) + function Get-OpenSpecCatalog { [CmdletBinding()] param( - [string]$Uri = 
'https://learn.microsoft.com/en-us/openspecs/windows_protocols/MS-WINPROTLP/e36c976a-6263-42a8-b119-7a3cc41ddd2a' + [string]$Uri = 'https://learn.microsoft.com/en-us/openspecs/windows_protocols/MS-WINPROTLP/e36c976a-6263-42a8-b119-7a3cc41ddd2a', + [switch]$IncludeReferenceSpecs ) $response = Invoke-OpenSpecRequest -Uri $Uri @@ -79,5 +90,21 @@ function Get-OpenSpecCatalog { } } + if ($IncludeReferenceSpecs) { + foreach ($ref in $script:OpenSpecReferenceSpecs) { + if ($seen.Add($ref.ProtocolId)) { + $entries.Add([pscustomobject]@{ + PSTypeName = 'AwakeCoding.OpenSpecs.Entry' + ProtocolId = $ref.ProtocolId + Title = $ref.Title + Description = '' + SpecPageUrl = $ref.SpecPageUrl + Slug = $ref.Slug + SourcePage = $script:OpenSpecReferenceDocsUri + }) + } + } + } + $entries } diff --git a/AwakeCoding.OpenSpecs/Public/Invoke-OpenSpecConversionPipeline.ps1 b/AwakeCoding.OpenSpecs/Public/Invoke-OpenSpecConversionPipeline.ps1 index c0d052c2..f4e52567 100644 --- a/AwakeCoding.OpenSpecs/Public/Invoke-OpenSpecConversionPipeline.ps1 +++ b/AwakeCoding.OpenSpecs/Public/Invoke-OpenSpecConversionPipeline.ps1 @@ -16,7 +16,9 @@ function Invoke-OpenSpecConversionPipeline { [switch]$Parallel, - [int]$ThrottleLimit = 4 + [int]$ThrottleLimit = 4, + + [switch]$RemoveDocumentIndex = $true ) if (-not $ProtocolId -and -not $Query) { @@ -31,5 +33,5 @@ function Invoke-OpenSpecConversionPipeline { } $toConvert = $downloadResults | Where-Object { $_.Status -in 'Downloaded', 'Exists' } - $toConvert | Convert-OpenSpecToMarkdown -OutputPath $OutputPath -Force:$Force -Parallel:$Parallel -ThrottleLimit $ThrottleLimit + $toConvert | Convert-OpenSpecToMarkdown -OutputPath $OutputPath -Force:$Force -Parallel:$Parallel -ThrottleLimit $ThrottleLimit -RemoveDocumentIndex:$RemoveDocumentIndex } diff --git a/AwakeCoding.OpenSpecs/Public/Save-OpenSpecDocument.ps1 b/AwakeCoding.OpenSpecs/Public/Save-OpenSpecDocument.ps1 index 63e1cbb9..02599a37 100644 --- a/AwakeCoding.OpenSpecs/Public/Save-OpenSpecDocument.ps1 
+++ b/AwakeCoding.OpenSpecs/Public/Save-OpenSpecDocument.ps1 @@ -17,7 +17,10 @@ function Save-OpenSpecDocument { [switch]$AllVersions, - [switch]$Force + [switch]$Force, + + [switch]$Parallel, + [int]$ThrottleLimit = 8 ) begin { @@ -45,8 +48,15 @@ function Save-OpenSpecDocument { } if ($item.ProtocolId) { - foreach ($link in (Get-OpenSpecDownloadLink -ProtocolId $item.ProtocolId -Format $Format -AllVersions:$AllVersions -IncludePrevious:$IncludePrevious)) { - [void]$links.Add($link) + if ($item.SpecPageUrl) { + foreach ($link in (Get-OpenSpecDownloadLink -InputObject $item -Format $Format -AllVersions:$AllVersions -IncludePrevious:$IncludePrevious)) { + [void]$links.Add($link) + } + } + else { + foreach ($link in (Get-OpenSpecDownloadLink -ProtocolId $item.ProtocolId -Format $Format -AllVersions:$AllVersions -IncludePrevious:$IncludePrevious)) { + [void]$links.Add($link) + } } } } @@ -73,6 +83,7 @@ function Save-OpenSpecDocument { } } + $toDownload = [System.Collections.Generic.List[object]]::new() foreach ($link in $links) { $fileName = $link.FileName if ([string]::IsNullOrWhiteSpace($fileName)) { @@ -94,15 +105,17 @@ function Save-OpenSpecDocument { continue } - if (-not $PSCmdlet.ShouldProcess($link.Url, "Download to $destination")) { - continue + if ($PSCmdlet.ShouldProcess($link.Url, "Download to $destination")) { + [void]$toDownload.Add([pscustomobject]@{ Link = $link; Destination = $destination }) } + } + $downloadOne = { + param($link, $destination) try { $attempt = 0 $maxRetries = 4 $delay = 1 - while ($true) { $attempt++ try { @@ -114,17 +127,12 @@ function Save-OpenSpecDocument { if ($_.Exception.Response -and $_.Exception.Response.StatusCode) { $statusCode = [int]$_.Exception.Response.StatusCode } - $transient = ($statusCode -in 429, 500, 502, 503, 504) -or (-not $statusCode) - if ($attempt -ge $maxRetries -or -not $transient) { - throw - } - + if ($attempt -ge $maxRetries -or -not $transient) { throw } Start-Sleep -Seconds $delay $delay = 
[Math]::Min($delay * 2, 16) } } - [pscustomobject]@{ PSTypeName = 'AwakeCoding.OpenSpecs.DownloadResult' ProtocolId = $link.ProtocolId @@ -148,5 +156,61 @@ function Save-OpenSpecDocument { } } } + + $useParallel = $Parallel -and $PSVersionTable.PSVersion.Major -ge 7 -and $toDownload.Count -gt 1 + if ($useParallel) { + $toDownload | ForEach-Object -Parallel { + $link = $_.Link + $destination = $_.Destination + try { + $attempt = 0 + $maxRetries = 4 + $delay = 1 + while ($true) { + $attempt++ + try { + Invoke-WebRequest -Uri $link.Url -OutFile $destination -MaximumRedirection 8 -ErrorAction Stop + break + } + catch { + $statusCode = $null + if ($_.Exception.Response -and $_.Exception.Response.StatusCode) { + $statusCode = [int]$_.Exception.Response.StatusCode + } + $transient = ($statusCode -in 429, 500, 502, 503, 504) -or (-not $statusCode) + if ($attempt -ge $maxRetries -or -not $transient) { throw } + Start-Sleep -Seconds $delay + $delay = [Math]::Min($delay * 2, 16) + } + } + [pscustomobject]@{ + PSTypeName = 'AwakeCoding.OpenSpecs.DownloadResult' + ProtocolId = $link.ProtocolId + Format = $link.Format + Url = $link.Url + Path = $destination + Status = 'Downloaded' + Size = (Get-Item -LiteralPath $destination).Length + } + } + catch { + [pscustomobject]@{ + PSTypeName = 'AwakeCoding.OpenSpecs.DownloadResult' + ProtocolId = $link.ProtocolId + Format = $link.Format + Url = $link.Url + Path = $destination + Status = 'Failed' + Error = $_.Exception.Message + Size = $null + } + } + } -ThrottleLimit $ThrottleLimit + } + else { + foreach ($item in $toDownload) { + & $downloadOne -link $item.Link -destination $item.Destination + } + } } } diff --git a/AwakeCoding.OpenSpecs/Public/Update-OpenSpecIndex.ps1 b/AwakeCoding.OpenSpecs/Public/Update-OpenSpecIndex.ps1 index 671e63aa..4f3a82a4 100644 --- a/AwakeCoding.OpenSpecs/Public/Update-OpenSpecIndex.ps1 +++ b/AwakeCoding.OpenSpecs/Public/Update-OpenSpecIndex.ps1 @@ -4,9 +4,13 @@ function Update-OpenSpecIndex { 
[Parameter(Mandatory)] [string]$Path, + [string]$Title = 'Microsoft Open Specifications', + [switch]$UseCatalogTitles = $true, - [switch]$IncludeDescription = $false + [switch]$IncludeDescription = $false, + + [string[]]$OverviewProtocolIds = @() ) if (-not (Test-Path -LiteralPath $Path)) { @@ -36,6 +40,9 @@ function Update-OpenSpecIndex { $specName = $dir.Name $mdFile = Join-Path -Path $dir.FullName -ChildPath "$specName.md" + if (-not (Test-Path -LiteralPath $mdFile)) { + $mdFile = Join-Path -Path $dir.FullName -ChildPath 'README.md' + } if (-not (Test-Path -LiteralPath $mdFile)) { $mdFile = Join-Path -Path $dir.FullName -ChildPath 'index.md' } @@ -45,18 +52,18 @@ function Update-OpenSpecIndex { } $mdFileName = [System.IO.Path]::GetFileName($mdFile) - $title = '' + $entryTitle = '' $description = '' $catalogEntry = $catalogMap[$specName] if ($catalogEntry) { - $title = $catalogEntry.Title + $entryTitle = $catalogEntry.Title if ($IncludeDescription -and $catalogEntry.Description) { $description = $catalogEntry.Description } } - if ([string]::IsNullOrWhiteSpace($title)) { + if ([string]::IsNullOrWhiteSpace($entryTitle)) { $lines = Get-Content -LiteralPath $mdFile -TotalCount 30 -ErrorAction SilentlyContinue $protocolLabelRegex = [regex]::new('^\*\*\[?(?:MS|MC)-[A-Z0-9-]+\]?\s*:\s*\*\*$', 'IgnoreCase') $boldLineRegex = [regex]::new('^\*\*(.+)\*\*$') @@ -73,45 +80,75 @@ function Update-OpenSpecIndex { if ($candidate -like "*$pat*") { $isBoilerplate = $true; break } } if (-not $isBoilerplate -and $candidate.Length -gt 2) { - $title = $candidate + $entryTitle = $candidate break } } } } - if ([string]::IsNullOrWhiteSpace($title)) { - $title = $specName + if ([string]::IsNullOrWhiteSpace($entryTitle)) { + $entryTitle = $specName } [void]$entries.Add([pscustomobject]@{ Name = $specName - Title = $title + Title = $entryTitle Description = $description Link = "$specName/$mdFileName" }) } + $overviewIds = 
[System.Collections.Generic.HashSet[string]]::new([System.StringComparer]::OrdinalIgnoreCase) + foreach ($id in $OverviewProtocolIds) { [void]$overviewIds.Add($id.Trim()) } + + $overviewEntries = @($entries | Where-Object { $overviewIds.Contains($_.Name) }) + $specEntries = @($entries | Where-Object { -not $overviewIds.Contains($_.Name) }) + $sb = New-Object System.Text.StringBuilder - [void]$sb.AppendLine('# Microsoft Open Specifications') + [void]$sb.AppendLine("# $Title") [void]$sb.AppendLine() - [void]$sb.AppendLine("$($entries.Count) protocol specifications converted to Markdown.") + $totalCount = $entries.Count + if ($overviewEntries.Count -gt 0 -and $specEntries.Count -gt 0) { + [void]$sb.AppendLine("$totalCount documents converted to Markdown (overview and protocol specifications).") + } + else { + [void]$sb.AppendLine("$totalCount protocol specifications converted to Markdown.") + } [void]$sb.AppendLine() - if ($IncludeDescription) { - [void]$sb.AppendLine('| Protocol | Title | Description |') - [void]$sb.AppendLine('|---|---|---|') - foreach ($entry in $entries) { - $descEscaped = ($entry.Description -replace '\|', ', ' -replace '\r?\n', ' ').Trim() - [void]$sb.AppendLine("| [$($entry.Name)]($($entry.Link)) | $($entry.Title) | $descEscaped |") + $writeTable = { + param($list, $includeDesc) + if ($includeDesc) { + [void]$sb.AppendLine('| Protocol | Title | Description |') + [void]$sb.AppendLine('|---|---|---|') + foreach ($entry in $list) { + $descEscaped = ($entry.Description -replace '\|', ', ' -replace '\r?\n', ' ').Trim() + [void]$sb.AppendLine("| [$($entry.Name)]($($entry.Link)) | $($entry.Title) | $descEscaped |") + } + } + else { + [void]$sb.AppendLine('| Protocol | Title |') + [void]$sb.AppendLine('|---|---|') + foreach ($entry in $list) { + [void]$sb.AppendLine("| [$($entry.Name)]($($entry.Link)) | $($entry.Title) |") + } } } - else { - [void]$sb.AppendLine('| Protocol | Title |') - [void]$sb.AppendLine('|---|---|') - foreach ($entry in $entries) 
{ - [void]$sb.AppendLine("| [$($entry.Name)]($($entry.Link)) | $($entry.Title) |") + + if ($overviewEntries.Count -gt 0) { + [void]$sb.AppendLine('## Overview') + [void]$sb.AppendLine() + & $writeTable $overviewEntries $IncludeDescription + [void]$sb.AppendLine() + } + + if ($specEntries.Count -gt 0) { + if ($overviewEntries.Count -gt 0) { + [void]$sb.AppendLine('## Protocol specifications') + [void]$sb.AppendLine() } + & $writeTable $specEntries $IncludeDescription } $readmePath = Join-Path -Path $Path -ChildPath 'README.md' diff --git a/README.md b/README.md index 7157c469..a9efeea1 100644 --- a/README.md +++ b/README.md @@ -28,13 +28,13 @@ These folders are tracked with `.gitkeep`, while their contents are ignored via ## Cmdlets -- `Get-OpenSpecCatalog` - Gets Windows Protocol technical document entries from the Learn catalog page. +- `Get-OpenSpecCatalog` - Gets Windows Protocol technical document entries from the Learn catalog page. Use `-IncludeReferenceSpecs` to also include reference docs (MS-DTYP, MS-ERREF, MS-LCID, MS-UCODEREF) from [Reference Documents](https://learn.microsoft.com/en-us/openspecs/windows_protocols/ms-winprotlp/1593dc07-6116-4e9e-8aeb-85c7438fab0a). - `Find-OpenSpec` - Filters catalog entries by query or protocol ID. - `Get-OpenSpecVersion` - Resolves latest (or all) version rows for a spec page. - `Get-OpenSpecDownloadLink` - Gets download URLs for PDF and/or DOCX. - `Save-OpenSpecDocument` - Downloads selected documents (accepts pipeline from `Get-OpenSpecCatalog` or `Get-OpenSpecDownloadLink`). - `Test-OpenSpecDownload` - End-to-end validation for a set of protocol IDs. -- `Convert-OpenSpecToMarkdown` - Converts downloaded DOCX/PDF files to Markdown (supports `-Parallel -ThrottleLimit N` on PowerShell 7+). +- `Convert-OpenSpecToMarkdown` - Converts downloaded DOCX/PDF files to Markdown (supports `-Parallel -ThrottleLimit N` on PowerShell 7+). 
By default removes the back-of-document index section (page numbers are not meaningful in Markdown); use `-RemoveDocumentIndex:$false` to keep it. - `Invoke-OpenSpecConversionPipeline` - Download + convert in one step; use `-Parallel -ThrottleLimit N` to run conversions in parallel. - `Get-OpenSpecConversionReport` - Reads conversion report artifacts from a converted-specs output tree. - `Test-OpenSpecMarkdownFidelity` - Runs lightweight fidelity checks on generated Markdown (headings, tables, anchors, TOC links). @@ -79,6 +79,13 @@ Test-OpenSpecMarkdownFidelity -OutputPath $ConvertedPath # Generate an index README for the converted specs (e.g. for publish branch) Update-OpenSpecIndex -Path $ConvertedPath -UseCatalogTitles Update-OpenSpecIndex -Path $ConvertedPath -UseCatalogTitles -IncludeDescription +Update-OpenSpecIndex -Path $ConvertedPath -Title 'RDP Specifications' -UseCatalogTitles -IncludeDescription # custom title +Update-OpenSpecIndex -Path $ConvertedPath -Title 'RDP Specifications' -UseCatalogTitles -IncludeDescription -OverviewProtocolIds MS-RDSOD # overview first + +# Include overview documents (e.g. 
Remote Desktop Services Overview MS-RDSOD; not in main catalog) +$overview = [pscustomobject]@{ ProtocolId='MS-RDSOD'; SpecPageUrl='https://learn.microsoft.com/en-us/openspecs/windows_protocols/ms-rdsod/072543f9-4bd4-4dc6-ab97-9a04bf9d2c6a' } +Get-OpenSpecDownloadLink -InputObject $overview -Format DOCX | Save-OpenSpecDocument -OutputPath $DownloadPath -Force +Convert-OpenSpecToMarkdown -Path "$DownloadPath/[MS-RDSOD]-230313.docx" -OutputPath $ConvertedPath -Force # Compare converted markdown structure to live Learn pages Compare-OpenSpecToLiveHtml -OutputPath $ConvertedPath -ProtocolId MS-RDPEWA,MS-RDPBCGR diff --git a/scripts/Build-Publish.ps1 b/scripts/Build-Publish.ps1 index 31e13025..ed442ccf 100644 --- a/scripts/Build-Publish.ps1 +++ b/scripts/Build-Publish.ps1 @@ -2,8 +2,17 @@ .SYNOPSIS Builds the publish tree the same way convert-and-publish.yml does, for local validation. .DESCRIPTION - Downloads all Open Specs DOCX, converts to markdown, builds the publish directory, - and generates the README index. Use this to validate the build locally before pushing. + Downloads all Open Specs DOCX, converts to markdown, repairs broken links, + builds the publish directory, generates the README index, and optionally creates + Windows_Protocols.zip (Microsoft publishes a PDF zip with the same name; this is the markdown equivalent). + Use -Filter for faster local iteration (e.g. -Filter 'MS-RDP' for RDP-related specs). 
+.EXAMPLE + .\Build-Publish.ps1 + .\Build-Publish.ps1 -ZipPath '' # skip zip, publish folder only +.EXAMPLE + .\Build-Publish.ps1 -Filter 'MS-RDP' # RDP-related specs only (faster local iteration) +.EXAMPLE + .\Build-Publish.ps1 -Filter 'MS-RDP','MS-NLMP','MS-KILE' # RDP + auth specs #> [CmdletBinding()] param( @@ -11,7 +20,10 @@ param( [string]$DownloadsPath = 'downloads-convert', [string]$ConvertedPath = 'converted-specs', [string]$PublishPath = 'publish', - [int]$ThrottleLimit = 4, + [string]$ZipPath = 'Windows_Protocols.zip', + [string]$IndexTitle = 'Microsoft Open Specifications', + [string[]]$Filter = @(), + [int]$ThrottleLimit = 8, [switch]$SkipOpenXmlInstall ) @@ -36,8 +48,25 @@ try { Import-Module (Join-Path $root 'AwakeCoding.OpenSpecs') -Force Write-Host 'Downloading DOCX files...' - $downloadResults = Get-OpenSpecCatalog | - Save-OpenSpecDocument -Format DOCX -OutputPath $dlPath -Force | + $catalog = Get-OpenSpecCatalog -IncludeReferenceSpecs + $patterns = @() + if ($Filter.Count -gt 0) { + $patterns = @($Filter | Where-Object { $_ } | ForEach-Object { + if ($_ -match '[*?\[\]]') { $_ } else { "$_*" } + }) + } + if ($patterns.Count -gt 0) { + $catalog = $catalog | Where-Object { + $specId = $_.ProtocolId # must not be named $pid: $PID is a read-only automatic variable and assigning to it throws + foreach ($p in $patterns) { + if ($specId -like $p) { return $true } + } + return $false + } + Write-Host "Filter ($($Filter -join ', ')) -> $($catalog.Count) specs" + } + $downloadResults = $catalog | + Save-OpenSpecDocument -Format DOCX -OutputPath $dlPath -Force -Parallel -ThrottleLimit $ThrottleLimit | Where-Object { $_.Status -in 'Downloaded', 'Exists' } $toConvert = @($downloadResults) @@ -46,20 +75,23 @@ try { Write-Host 'Converting to markdown (parallel)...' $toConvert | Convert-OpenSpecToMarkdown -OutputPath $convPath -Force -Parallel -ThrottleLimit $ThrottleLimit | Out-Null + Write-Host 'Repairing broken links...' 
+ $repairScript = Join-Path $root 'scripts\Repair-AllBrokenLinks.ps1' + & $repairScript -Path $convPath -Parallel -ThrottleLimit $ThrottleLimit + Write-Host 'Building publish directory...' New-Item -Path $pubPath -ItemType Directory -Force | Out-Null Get-ChildItem -LiteralPath $convPath -Directory | ForEach-Object { $name = $_.Name $md = Join-Path $_.FullName "$name.md" - if (-not (Test-Path -LiteralPath $md)) { - $md = Join-Path $_.FullName 'index.md' - } + if (-not (Test-Path -LiteralPath $md)) { $md = Join-Path $_.FullName 'README.md' } + if (-not (Test-Path -LiteralPath $md)) { $md = Join-Path $_.FullName 'index.md' } if (-not (Test-Path -LiteralPath $md)) { return } $dest = Join-Path $pubPath $name New-Item -Path $dest -ItemType Directory -Force | Out-Null - Copy-Item -LiteralPath $md -Destination (Join-Path $dest 'index.md') -Force + Copy-Item -LiteralPath $md -Destination $dest -Force $media = Join-Path $_.FullName 'media' if (Test-Path -LiteralPath $media -PathType Container) { @@ -68,7 +100,14 @@ try { } Write-Host 'Updating index (README.md)...' - Update-OpenSpecIndex -Path $pubPath + Update-OpenSpecIndex -Path $pubPath -Title $IndexTitle + + if ($ZipPath) { + $zipFull = if ([System.IO.Path]::IsPathRooted($ZipPath)) { $ZipPath } else { Join-Path $root $ZipPath } + Write-Host "Creating $zipFull ..." + Compress-Archive -Path (Join-Path $pubPath '*') -DestinationPath $zipFull -Force + Write-Host "Zip created: $zipFull" + } $entryCount = (Get-Content (Join-Path $pubPath 'README.md') | Select-String '^\| \[.*\]').Count Write-Host "Done. Publish folder: $pubPath ($entryCount specs)" diff --git a/scripts/Convert-TocToGitHubFriendly.ps1 b/scripts/Convert-TocToGitHubFriendly.ps1 new file mode 100644 index 00000000..5da299d9 --- /dev/null +++ b/scripts/Convert-TocToGitHubFriendly.ps1 @@ -0,0 +1,12 @@ +# Run ConvertTo-OpenSpecGitHubFriendlyToc on a spec file. 
+param([Parameter(Mandatory)][string]$Path) +$ErrorActionPreference = 'Stop' +$repoRoot = (Get-Item $PSScriptRoot).Parent.FullName +. (Join-Path $repoRoot 'AwakeCoding.OpenSpecs\Private\Invoke-OpenSpecMarkdownCleanup.ps1') +$md = Get-Content -LiteralPath $Path -Raw -Encoding UTF8 +$r = ConvertTo-OpenSpecGitHubFriendlyToc -Markdown $md +Write-Host 'Rewritten:' $r.Rewritten +if ($r.Rewritten) { + Set-Content -LiteralPath $Path -Value $r.Markdown -Encoding UTF8 -NoNewline + Write-Host 'File updated.' +} diff --git a/scripts/DownloadAndConvertAll.ps1 b/scripts/DownloadAndConvertAll.ps1 new file mode 100644 index 00000000..5f841e9f --- /dev/null +++ b/scripts/DownloadAndConvertAll.ps1 @@ -0,0 +1,15 @@ +# Download and convert all Windows protocol specs +$ErrorActionPreference = 'Stop' +$repoRoot = Split-Path -Parent $PSScriptRoot +Set-Location $repoRoot + +Import-Module ./AwakeCoding.OpenSpecs/AwakeCoding.OpenSpecs.psd1 -Force + +$dl = Join-Path $repoRoot 'artifacts/downloads' +$out = Join-Path $repoRoot 'artifacts/converted-specs' +New-Item -Path $dl, $out -ItemType Directory -Force | Out-Null + +Get-OpenSpecCatalog -IncludeReferenceSpecs | + Save-OpenSpecDocument -Format DOCX -OutputPath $dl -Force | + Where-Object { $_.Status -in 'Downloaded', 'Exists' } | + Convert-OpenSpecToMarkdown -OutputPath $out -Force -Parallel -ThrottleLimit 4 diff --git a/scripts/Get-BrokenLinksReport.ps1 b/scripts/Get-BrokenLinksReport.ps1 new file mode 100644 index 00000000..08cd3cca --- /dev/null +++ b/scripts/Get-BrokenLinksReport.ps1 @@ -0,0 +1,142 @@ +<# +.SYNOPSIS + Reports markdown link targets that have no matching anchor in the document. +.DESCRIPTION + Scans one or more .md files for [text](#fragment) links and <a id="..."></a> anchors, + then lists link targets with no matching anchor, grouped by category (gt_, Section_guid, Section_N.N). 
+.EXAMPLE + .\Get-BrokenLinksReport.ps1 -Path artifacts\converted-specs\MS-RDPBCGR\MS-RDPBCGR.md +.EXAMPLE + .\Get-BrokenLinksReport.ps1 -Path artifacts\converted-specs -OutputReport +#> +[CmdletBinding()] +param( + [Parameter(Mandatory)] + [string]$Path, + [switch]$OutputReport +) + +$ErrorActionPreference = 'Stop' +$files = if (Test-Path -LiteralPath $Path -PathType Container) { + Get-ChildItem -LiteralPath $Path -Recurse -Filter '*.md' -File | Select-Object -ExpandProperty FullName +} elseif (Test-Path -LiteralPath $Path -PathType Leaf) { + (Resolve-Path -LiteralPath $Path).ProviderPath # resolve against the PowerShell location that Test-Path used, not the process working directory +} else { + Write-Error "Path not found: $Path" +} + +$linkRegex = [regex]::new('\[(?<text>[^\]]+)\]\(#(?<target>[^)]+)\)') +$anchorRegex = [regex]::new('<a\s+id="([^"]+)"\s*></a>', 'IgnoreCase') + +foreach ($mdPath in $files) { + $content = Get-Content -LiteralPath $mdPath -Raw -Encoding UTF8 + $anchors = [System.Collections.Generic.HashSet[string]]::new([StringComparer]::OrdinalIgnoreCase) + foreach ($m in $anchorRegex.Matches($content)) { + [void]$anchors.Add($m.Groups[1].Value) + } + $linkTargets = [System.Collections.Generic.Dictionary[string, [System.Collections.Generic.List[string]]]]::new([StringComparer]::OrdinalIgnoreCase) + $linkTextsByTarget = [System.Collections.Generic.Dictionary[string, [System.Collections.Generic.List[string]]]]::new([StringComparer]::OrdinalIgnoreCase) + foreach ($m in $linkRegex.Matches($content)) { + $target = $m.Groups['target'].Value + $text = ($m.Groups['text'].Value -replace '\*+', '').Trim() + if (-not $linkTargets.ContainsKey($target)) { + $linkTargets[$target] = [System.Collections.Generic.List[string]]::new() + } + $linkTargets[$target].Add($target) | Out-Null + if (-not $linkTextsByTarget.ContainsKey($target)) { + $linkTextsByTarget[$target] = [System.Collections.Generic.List[string]]::new() + } + if (-not [string]::IsNullOrWhiteSpace($text)) { + $linkTextsByTarget[$target].Add($text) | Out-Null + } + } + $broken = 
[System.Collections.Generic.List[string]]::new() + foreach ($t in $linkTargets.Keys) { + if (-not $anchors.Contains($t)) { + $broken.Add($t) + } + } + $gt = [System.Collections.Generic.List[string]]::new() + $sectionGuid = [System.Collections.Generic.List[string]]::new() + $sectionNum = [System.Collections.Generic.List[string]]::new() + foreach ($b in $broken) { + if ($b -match '^gt_[a-f0-9\-]{36}$') { $gt.Add($b) } + elseif ($b -match '^Section_[a-f0-9]{32}$') { $sectionGuid.Add($b) } + elseif ($b -match '^Section_\d+(?:\.\d+)*$') { $sectionNum.Add($b) } + } + $sectionGuidTextCounts = @{} + foreach ($target in $sectionGuid) { + if (-not $linkTextsByTarget.ContainsKey($target)) { continue } + foreach ($text in $linkTextsByTarget[$target]) { + if ([string]::IsNullOrWhiteSpace($text)) { continue } + if (-not $sectionGuidTextCounts.ContainsKey($text)) { $sectionGuidTextCounts[$text] = 0 } + $sectionGuidTextCounts[$text]++ + } + } + $protocolId = [System.IO.Path]::GetFileNameWithoutExtension($mdPath) + $report = @" +# Broken Links Report: $protocolId + +Generated from link targets that do not have a matching ``<a id="..."></a>`` in the document. + +## Summary + +| Category | Count | Description | +|----------|-------|-------------| +| **gt_ GUID** | $($gt.Count) | Glossary links still using Word bookmark IDs | +| **Section_<32hex>** | $($sectionGuid.Count) | Section links using Word GUIDs | +| **Section_X.Y.Z** (numeric) | $($sectionNum.Count) | Section number links with no anchor in doc | +| **Other** | $($broken.Count - $gt.Count - $sectionGuid.Count - $sectionNum.Count) | Other unresolved fragments | +| **Total broken** | **$($broken.Count)** | Unique link targets with no matching anchor | + +"@ + if ($gt.Count -gt 0) { + $report += "`n## 1. Glossary (gt_) links - $($gt.Count) broken`n`n" + $report += ($gt | Sort-Object | ForEach-Object { "- ``$_``" }) -join "`n" + $report += "`n" + } + if ($sectionGuid.Count -gt 0) { + $report += "`n## 2. 
Section GUID links - $($sectionGuid.Count) broken`n`n" + $report += "Sample: " + (($sectionGuid | Sort-Object | Select-Object -First 5) -join ", ") + if ($sectionGuid.Count -gt 5) { $report += " ... and $($sectionGuid.Count - 5) more" } + $report += "`n" + if ($sectionGuidTextCounts.Count -gt 0) { + $topPatterns = $sectionGuidTextCounts.GetEnumerator() | Sort-Object -Property Value -Descending | Select-Object -First 10 + $report += "`nTop unresolved Section GUID link texts:`n" + foreach ($p in $topPatterns) { + $report += "- ``$($p.Key)`` ($($p.Value))`n" + } + } + } + if ($sectionNum.Count -gt 0) { + $report += "`n## 3. Section number links - $($sectionNum.Count) broken`n`n" + $report += "Sample: " + (($sectionNum | Sort-Object | Select-Object -First 10) -join ", ") + if ($sectionNum.Count -gt 10) { $report += " ... and $($sectionNum.Count - 10) more" } + $report += "`n" + } + Write-Host "=== $protocolId ===" + Write-Host "Broken: $($broken.Count) (gt_: $($gt.Count), Section_guid: $($sectionGuid.Count), Section_N.N: $($sectionNum.Count))" + $conversionReportPath = Join-Path -Path ([System.IO.Path]::GetDirectoryName($mdPath)) -ChildPath 'artifacts\conversion-report.json' + if (Test-Path -LiteralPath $conversionReportPath) { + $conv = Get-Content -LiteralPath $conversionReportPath -Raw -Encoding UTF8 | ConvertFrom-Json + $deterministic = 0 + $heuristic = 0 + foreach ($issue in @($conv.Issues)) { + switch ([string]$issue.Type) { + 'GuidAnchorResolved' { $deterministic += [int]$issue.Count; break } + 'GlossaryAnchorsAndLinks' { $deterministic += [int]$issue.SourceMapLinksRepaired; $heuristic += ([int]$issue.LinksRepaired - [int]$issue.SourceMapLinksRepaired); break } + 'SectionGuidLinksRepairedByHeading' { $heuristic += [int]$issue.Count; break } + 'SectionNumberLinksRepaired' { $heuristic += [int]$issue.Count; break } + } + } + Write-Host "Repairs (conversion report): deterministic=$deterministic heuristic=$heuristic" + $report += "`n## Repair source diagnostics`n`n" 
+ $report += "- Deterministic repairs (source-map driven): **$deterministic**`n" + $report += "- Heuristic repairs (text/title matching): **$heuristic**`n" + } + if ($OutputReport) { + $reportPath = [System.IO.Path]::Combine([System.IO.Path]::GetDirectoryName($mdPath), 'broken-links-report.md') + $report | Set-Content -LiteralPath $reportPath -Encoding UTF8 -NoNewline + Write-Host "Report written: $reportPath" + } +} diff --git a/scripts/Get-BrokenLinksSummary.ps1 b/scripts/Get-BrokenLinksSummary.ps1 new file mode 100644 index 00000000..cb63ee74 --- /dev/null +++ b/scripts/Get-BrokenLinksSummary.ps1 @@ -0,0 +1,33 @@ +# Aggregate broken link counts from Get-BrokenLinksReport output +param([string]$Path = (Join-Path (Get-Location) 'artifacts\converted-specs')) + +$reportScript = Join-Path $PSScriptRoot 'Get-BrokenLinksReport.ps1' +$tmp = [System.IO.Path]::GetTempFileName() +try { + & $reportScript -Path $Path -OutputReport *> $tmp + $lines = Get-Content -LiteralPath $tmp -Encoding utf8 +} finally { Remove-Item -LiteralPath $tmp -ErrorAction SilentlyContinue } +$total = 0 +$gtTotal = 0 +$sectionGuidTotal = 0 +$sectionNumTotal = 0 +$specsWithBroken = 0 + +foreach ($line in $lines) { + $s = [string]$line + if ($s -match 'Broken: (\d+).*gt_: (\d+).*Section_guid: (\d+).*Section_N\.N: (\d+)') { + $n = [int]$Matches[1] + if ($n -gt 0) { + $total += $n + $gtTotal += [int]$Matches[2] + $sectionGuidTotal += [int]$Matches[3] + $sectionNumTotal += [int]$Matches[4] + $specsWithBroken++ + } + } +} +Write-Host "Total broken link targets: $total" +Write-Host " - gt_ (glossary): $gtTotal" +Write-Host " - Section_<32hex>: $sectionGuidTotal" +Write-Host " - Section_N.N: $sectionNumTotal" +Write-Host "Files with broken links: $specsWithBroken" diff --git a/scripts/Remove-FrontMatterBoilerplate.ps1 b/scripts/Remove-FrontMatterBoilerplate.ps1 new file mode 100644 index 00000000..9cc3531f --- /dev/null +++ b/scripts/Remove-FrontMatterBoilerplate.ps1 @@ -0,0 +1,12 @@ +# One-off: run 
Remove-OpenSpecFrontMatterBoilerplate on a spec file. +param([Parameter(Mandatory)][string]$Path) +$ErrorActionPreference = 'Stop' +$repoRoot = (Get-Item $PSScriptRoot).Parent.FullName +. (Join-Path $repoRoot 'AwakeCoding.OpenSpecs\Private\Invoke-OpenSpecMarkdownCleanup.ps1') +$md = Get-Content -LiteralPath $Path -Raw -Encoding UTF8 +$r = Remove-OpenSpecFrontMatterBoilerplate -Markdown $md +Write-Host 'Removed:' $r.Removed +if ($r.Removed) { + Set-Content -LiteralPath $Path -Value $r.Markdown -Encoding UTF8 -NoNewline + Write-Host 'File updated.' +} diff --git a/scripts/Repair-AllBrokenLinks.ps1 b/scripts/Repair-AllBrokenLinks.ps1 new file mode 100644 index 00000000..1b4aa72c --- /dev/null +++ b/scripts/Repair-AllBrokenLinks.ps1 @@ -0,0 +1,96 @@ +<# +.SYNOPSIS + Runs Section GUID and Glossary link repairs on all converted specs. +.DESCRIPTION + Iterates over main .md files in converted-specs, runs Repair-OpenSpecSectionGuidLinksByHeadingMatch + and Add-OpenSpecGlossaryAnchorsAndRepairLinks, and overwrites files when repairs are made. +.EXAMPLE + .\Repair-AllBrokenLinks.ps1 -Path artifacts\converted-specs +#> +[CmdletBinding()] +param( + [Parameter()] + [string]$Path = (Join-Path (Get-Location) 'artifacts\converted-specs'), + [switch]$WhatIf, + [switch]$Parallel, + [int]$ThrottleLimit = 8 +) + +$ErrorActionPreference = 'Stop' +$repoRoot = (Get-Item $PSScriptRoot).Parent.FullName +$cleanupPath = Join-Path $repoRoot 'AwakeCoding.OpenSpecs\Private\Invoke-OpenSpecMarkdownCleanup.ps1' +if (-not (Test-Path -LiteralPath $cleanupPath)) { + Write-Error "Cleanup script not found: $cleanupPath" +} +. 
$cleanupPath + +$resolved = $ExecutionContext.SessionState.Path.GetUnresolvedProviderPathFromPSPath($Path) # relative paths resolve against the PowerShell location, not the process working directory +if (-not (Test-Path -LiteralPath $resolved -PathType Container)) { + Write-Error "Path not found: $resolved" +} + +# Main spec files: <ProtocolId>/<ProtocolId>.md, exclude artifacts subdirs and reports +$specFiles = @(Get-ChildItem -LiteralPath $resolved -Directory | ForEach-Object { + $dir = $_ + $name = $dir.Name + $mdPath = Join-Path $dir.FullName "$name.md" + if (Test-Path -LiteralPath $mdPath -PathType Leaf) { $mdPath } +} | Where-Object { $_ }) + +$useParallel = $Parallel -and $PSVersionTable.PSVersion.Major -ge 7 -and $specFiles.Count -gt 1 +$whatIfArg = $WhatIf + +if ($useParallel) { + $results = $specFiles | ForEach-Object -Parallel { + . $using:cleanupPath + $mdPath = $_ + $content = Get-Content -LiteralPath $mdPath -Raw -Encoding UTF8 + $sectionResult = Repair-OpenSpecSectionGuidLinksByHeadingMatch -Markdown $content + $content = $sectionResult.Markdown + $glossaryResult = Add-OpenSpecGlossaryAnchorsAndRepairLinks -Markdown $content + $content = $glossaryResult.Markdown + $changed = ($sectionResult.LinksRepaired -gt 0) -or ($glossaryResult.AnchorsInjected -gt 0) -or ($glossaryResult.LinksRepaired -gt 0) + if ($changed -and -not $using:whatIfArg) { + Set-Content -LiteralPath $mdPath -Value $content -Encoding UTF8 -NoNewline + } + [pscustomobject]@{ + SectionRepaired = $sectionResult.LinksRepaired + GlossaryRepaired = $glossaryResult.LinksRepaired + AnchorsInjected = $glossaryResult.AnchorsInjected + Updated = $changed -and -not $using:whatIfArg + SpecName = [System.IO.Path]::GetFileName([System.IO.Path]::GetDirectoryName($mdPath)) + } + } -ThrottleLimit $ThrottleLimit + + $totalSection = ($results | Measure-Object -Property SectionRepaired -Sum).Sum + $totalGlossary = ($results | Measure-Object -Property GlossaryRepaired -Sum).Sum + $updated = ($results | Where-Object Updated).Count + foreach ($r in ($results | Where-Object Updated)) { + Write-Host "Updated: $($r.SpecName) 
(Section:$($r.SectionRepaired) Glossary:$($r.GlossaryRepaired)+$($r.AnchorsInjected))" + } +} +else { + $totalSection = 0 + $totalGlossary = 0 + $updated = 0 + foreach ($mdPath in $specFiles) { + $content = Get-Content -LiteralPath $mdPath -Raw -Encoding UTF8 + $sectionResult = Repair-OpenSpecSectionGuidLinksByHeadingMatch -Markdown $content + $content = $sectionResult.Markdown + $totalSection += $sectionResult.LinksRepaired + + $glossaryResult = Add-OpenSpecGlossaryAnchorsAndRepairLinks -Markdown $content + $content = $glossaryResult.Markdown + $totalGlossary += $glossaryResult.LinksRepaired + + $changed = ($sectionResult.LinksRepaired -gt 0) -or ($glossaryResult.AnchorsInjected -gt 0) -or ($glossaryResult.LinksRepaired -gt 0) + if ($changed -and -not $WhatIf) { + Set-Content -LiteralPath $mdPath -Value $content -Encoding UTF8 -NoNewline + $updated++ + $rel = [System.IO.Path]::GetFileName([System.IO.Path]::GetDirectoryName($mdPath)) + Write-Host "Updated: $rel (Section:$($sectionResult.LinksRepaired) Glossary:$($glossaryResult.LinksRepaired)+$($glossaryResult.AnchorsInjected))" + } + } +} + +Write-Host "`nTotal: Section GUID links repaired=$totalSection, Glossary links repaired=$totalGlossary, Files updated=$updated" diff --git a/scripts/Repair-GlossaryLinks.ps1 b/scripts/Repair-GlossaryLinks.ps1 new file mode 100644 index 00000000..dada2c32 --- /dev/null +++ b/scripts/Repair-GlossaryLinks.ps1 @@ -0,0 +1,12 @@ +# Run Add-OpenSpecGlossaryAnchorsAndRepairLinks on a spec file to fix gt_ GUID links. +param([Parameter(Mandatory)][string]$Path) +$ErrorActionPreference = 'Stop' +$repoRoot = (Get-Item $PSScriptRoot).Parent.FullName +. 
(Join-Path $repoRoot 'AwakeCoding.OpenSpecs\Private\Invoke-OpenSpecMarkdownCleanup.ps1') +$md = Get-Content -LiteralPath $Path -Raw -Encoding UTF8 +$r = Add-OpenSpecGlossaryAnchorsAndRepairLinks -Markdown $md +Write-Host 'AnchorsInjected:' $r.AnchorsInjected 'LinksRepaired:' $r.LinksRepaired +if ($r.LinksRepaired -gt 0 -or $r.AnchorsInjected -gt 0) { # injected anchors also modify the markdown; same change test as Repair-AllBrokenLinks.ps1 + Set-Content -LiteralPath $Path -Value $r.Markdown -Encoding UTF8 -NoNewline + Write-Host 'File updated.' +} diff --git a/scripts/Repair-MissingSectionAnchors.ps1 b/scripts/Repair-MissingSectionAnchors.ps1 new file mode 100644 index 00000000..cdc7fdb1 --- /dev/null +++ b/scripts/Repair-MissingSectionAnchors.ps1 @@ -0,0 +1,36 @@ +<# +.SYNOPSIS + Injects missing Section_N.N anchors into an already-converted spec using TOC titles. +.DESCRIPTION + Runs Add-OpenSpecMissingSectionAnchorsFromToc on the given markdown file and overwrites it. + Use this to fix "Section_X.Y.Z (numeric)" broken links without reconverting from DOCX. +.EXAMPLE + .\Repair-MissingSectionAnchors.ps1 -Path artifacts\converted-specs\MS-RDPBCGR\MS-RDPBCGR.md +#> +[CmdletBinding()] +param( + [Parameter(Mandatory)] + [string]$Path +) + +$ErrorActionPreference = 'Stop' +$fullPath = $ExecutionContext.SessionState.Path.GetUnresolvedProviderPathFromPSPath($Path) # relative paths resolve against the PowerShell location, not the process working directory +if (-not (Test-Path -LiteralPath $fullPath -PathType Leaf)) { + Write-Error "File not found: $fullPath" +} + +$repoRoot = (Get-Item $PSScriptRoot).Parent.FullName +$privateScript = Join-Path $repoRoot 'AwakeCoding.OpenSpecs\Private\Invoke-OpenSpecMarkdownCleanup.ps1' +if (-not (Test-Path -LiteralPath $privateScript -PathType Leaf)) { + Write-Error "Cleanup script not found: $privateScript" +} + +. $privateScript +$markdown = Get-Content -LiteralPath $fullPath -Raw -Encoding UTF8 +$result = Add-OpenSpecMissingSectionAnchorsFromToc -Markdown $markdown +if ($result.InjectedCount -gt 0) { + $result.Markdown | Set-Content -LiteralPath $fullPath -Encoding UTF8 -NoNewline + Write-Host "Injected $($result.InjectedCount) missing section anchor(s). 
File updated: $fullPath" +} else { + Write-Host "No missing section anchors to inject." +} diff --git a/scripts/Repair-SectionGuidLinks.ps1 b/scripts/Repair-SectionGuidLinks.ps1 new file mode 100644 index 00000000..50c7b98a --- /dev/null +++ b/scripts/Repair-SectionGuidLinks.ps1 @@ -0,0 +1,12 @@ +# Run Repair-OpenSpecSectionGuidLinksByHeadingMatch on a spec file. +param([Parameter(Mandatory)][string]$Path) +$ErrorActionPreference = 'Stop' +$repoRoot = (Get-Item $PSScriptRoot).Parent.FullName +. (Join-Path $repoRoot 'AwakeCoding.OpenSpecs\Private\Invoke-OpenSpecMarkdownCleanup.ps1') +$md = Get-Content -LiteralPath $Path -Raw -Encoding UTF8 +$r = Repair-OpenSpecSectionGuidLinksByHeadingMatch -Markdown $md +Write-Host 'LinksRepaired:' $r.LinksRepaired +if ($r.LinksRepaired -gt 0) { + Set-Content -LiteralPath $Path -Value $r.Markdown -Encoding UTF8 -NoNewline + Write-Host 'File updated.' +} diff --git a/scripts/Test-DocxLinkMetadataCapture.ps1 b/scripts/Test-DocxLinkMetadataCapture.ps1 new file mode 100644 index 00000000..da269204 --- /dev/null +++ b/scripts/Test-DocxLinkMetadataCapture.ps1 @@ -0,0 +1,49 @@ +[CmdletBinding()] +param( + [Parameter(Mandatory)] + [string]$DocxPath, + + [Parameter(Mandatory)] + [string]$OutputMarkdownPath +) + +$ErrorActionPreference = 'Stop' +$repoRoot = (Get-Item $PSScriptRoot).Parent.FullName +$privateDir = Join-Path $repoRoot 'AwakeCoding.OpenSpecs\Private' +Get-ChildItem -LiteralPath $privateDir -Filter '*.ps1' | ForEach-Object { + . 
$_.FullName +} + +$toolchain = [pscustomobject]@{ + HasOpenXml = $false +} + +try { + $result = ConvertFrom-OpenSpecDocxWithOpenXml -InputPath $DocxPath -OutputPath $OutputMarkdownPath -Toolchain $toolchain +} +catch { + Write-Host "Exception type: $($_.Exception.GetType().FullName)" + Write-Host "Message: $($_.Exception.Message)" + if ($_.InvocationInfo) { + Write-Host "Position: $($_.InvocationInfo.PositionMessage)" + } + if ($_.ScriptStackTrace) { + Write-Host "Stack:" + Write-Host $_.ScriptStackTrace + } + if ($_.Exception.InnerException) { + Write-Host "Inner: $($_.Exception.InnerException.GetType().FullName): $($_.Exception.InnerException.Message)" + } + throw +} + +if (-not $result.PSObject.Properties['LinkMetadata']) { + throw 'LinkMetadata was not returned from DOCX conversion step.' +} + +$meta = $result.LinkMetadata +Write-Host "GuidToSection: $($meta.GuidToSection.Count)" +Write-Host "SectionToTitle: $($meta.SectionToTitle.Count)" +Write-Host "TocAlias: $($meta.TocAlias.Count)" +Write-Host "GuidToGlossarySlug: $($meta.GuidToGlossarySlug.Count)" +Write-Host "InternalHyperlinks: $(@($meta.InternalHyperlinks).Count)"