...
1<#
2 .SYNOPSIS
3 Check broken links.
4
5 .DESCRIPTION
6 The Verify-Links.ps1 script will check whether the files contain any broken links.
7
8 .PARAMETER urls
9 Specify url list to verify links. Can either be a http address or a local file request. Local file paths support md and html files.
10
11 .PARAMETER ignoreLinksFile
12 Specifies the file that contains a set of links to ignore when verifying.
13
14 .PARAMETER devOpsLogging
15 Switch that will enable devops specific logging for warnings
16
17 .PARAMETER recursive
  Check the links recursively, following valid links that begin with baseUrl.
19
20 .PARAMETER baseUrl
21 Recursively check links for all links verified that begin with this baseUrl, defaults to the folder the url is contained in.
22
23 .PARAMETER rootUrl
24 Path to the root of the site for resolving rooted relative links, defaults to host root for http and file directory for local files.
25
26 .PARAMETER errorStatusCodes
27 List of http status codes that count as broken links. Defaults to 400, 401, 404, SocketError.HostNotFound = 11001, SocketError.NoData = 11004.
28
29 .PARAMETER branchReplaceRegex
30 Regex to check if the link needs to be replaced. E.g. ^(https://github.com/.*/(?:blob|tree)/)main(/.*)$
31
32 .PARAMETER branchReplacementName
33 The substitute branch name or SHA commit.
34
35 .PARAMETER checkLinkGuidance
36 Flag to allow checking against azure sdk link guidance. Check link guidance here: https://aka.ms/azsdk/guideline/links.
37
38 .PARAMETER userAgent
39 UserAgent to be configured for web requests. Defaults to current Chrome version.
40
41 .PARAMETER inputCacheFile
42 Path to a file that contains a list of links that are known valid so we can skip checking them.
43
44 .PARAMETER outputCacheFile
45 Path to a file that the script will output all the validated links after running all checks.
46
47 .PARAMETER requestTimeoutSec
48 The number of seconds before we timeout when sending an individual web request. Default is 15 seconds.
49
50 .EXAMPLE
51 PS> .\Verify-Links.ps1 C:\README.md
52
53 .EXAMPLE
54 PS> .\Verify-Links.ps1 https://azure.github.io/azure-sdk/index.html
55
56 .EXAMPLE
57 PS> .\Verify-Links C:\README.md -checkLinkGuidance $true
58#>
[CmdletBinding()]
param (
  # Pages to verify: http(s) addresses or local file paths.
  [string[]] $urls,
  # File listing links to skip; blank lines and lines starting with '#' are not treated as links.
  [string] $ignoreLinksFile = "$PSScriptRoot/ignore-links.txt",
  # Emit Azure DevOps "##vso[...]" logging commands instead of Write-Warning/Write-Error.
  [switch] $devOpsLogging = $false,
  # Also crawl valid links that start with $baseUrl.
  [switch] $recursive = $true,
  # Prefix that a link must have to be crawled; empty means "derive from the first url".
  [string] $baseUrl = "",
  # Root used to resolve links that start with '/'; empty means "derive from the first url".
  [string] $rootUrl = "",
  # Status / socket error codes that mark a link as broken.
  [array] $errorStatusCodes = @(400, 401, 404, 11001, 11004),
  # Regex with two capture groups around the branch name to substitute.
  [string] $branchReplaceRegex = "",
  # Branch name or commit SHA inserted between the two capture groups.
  [string] $branchReplacementName = "",
  # Enforce the link guidance rules (https only, no relative links, lower-case anchors, no locale).
  [bool] $checkLinkGuidance = $false,
  # User agent sent with web requests; a Chrome default is applied when empty.
  [string] $userAgent,
  # File or http(s) url listing links already known to be valid.
  [string] $inputCacheFile,
  # File that receives every link validated during this run.
  [string] $outputCacheFile,
  # Per-request timeout in seconds. Typed as [int] so that a non-numeric value is rejected when
  # the script is invoked instead of failing (and being swallowed) on every single web request.
  [int] $requestTimeoutSec = 15
)
76
# Keep Invoke-WebRequest from rendering its progress display.
$ProgressPreference = "SilentlyContinue"

# Pattern matched against link uris to detect locale segments.
$locale = "/en-us/"

# Text recorded in the broken-link summary in place of an empty link.
$emptyLinkMessage = "There is at least one empty link in the page. Please replace with absolute link. Check here for more information: https://aka.ms/azsdk/guideline/links"

# Fall back to a fixed Chrome user agent when the caller did not provide one.
if ([string]::IsNullOrEmpty($userAgent)) {
  $userAgent = "Chrome/87.0.4280.88"
}
function NormalizeUrl([string]$url){
  # Converts a url or local path into a System.Uri and, as a side effect, fills in
  # $script:baseUrl and $script:rootUrl when they are still empty.

  # Existing local paths are rewritten as file:// urls.
  $isLocalPath = Test-Path $url
  if ($isLocalPath) {
    $resolvedPath = (Resolve-Path $url).ToString()
    $url = "file://" + $resolvedPath
  }

  Write-Verbose "The url to check against: $url."
  $uri = [System.Uri]$url

  if ($script:baseUrl -eq "") {
    # Base url defaults to the directory that contains the page.
    $containingFolder = New-Object System.Uri($uri, ".")
    $script:baseUrl = $containingFolder.ToString()
  }

  if ($script:rootUrl -eq "") {
    if (!$uri.IsFile) {
      # Http pages are rooted at the host root.
      $script:rootUrl = New-Object System.Uri($uri, "/")
    }
    else {
      # Local files are rooted at their containing directory.
      $script:rootUrl = $script:baseUrl
    }
  }

  return $uri
}
109
function LogWarning
{
  # Writes the arguments as a warning: a plain PowerShell warning by default, or an
  # Azure DevOps logging command when $devOpsLogging is set.
  if (-not $devOpsLogging)
  {
    Write-Warning "$args"
    return
  }

  Write-Host "##vso[task.LogIssue type=warning;]$args"
}
121
function LogError
{
  # Writes the arguments as an error: a plain PowerShell error by default, or an
  # Azure DevOps logging command when $devOpsLogging is set.
  if (-not $devOpsLogging)
  {
    Write-Error "$args"
    return
  }

  Write-Host "##vso[task.logissue type=error]$args"
}
133
function ResolveUri ([System.Uri]$referralUri, [string]$link)
{
  # Turns a raw href found on $referralUri into the uri that should be checked.
  # Emits nothing for links that are skipped (mailto, non http/file schemes, ignored links).

  if ($link.StartsWith("mailto:")) {
    Write-Verbose "Skipping $link because it is a mailto link."
    return
  }

  $linkUri = [System.Uri]$link

  if (!$linkUri.IsAbsoluteUri) {
    # The link guidelines forbid relative links, so when they are being enforced the
    # relative uri is handed back unresolved.
    if ($checkLinkGuidance) {
      return $linkUri
    }

    if ($link.StartsWith("/")) {
      # Rooted paths are resolved against the root url.
      Write-Verbose "rooturl = $rootUrl"
      $linkUri = New-Object System.Uri([System.Uri]$rootUrl, ".$link")
    }
    else {
      # Everything else is relative to the referring page.
      $linkUri = New-Object System.Uri($referralUri, $link)
    }
  }

  # Reduce the uri to the components used for an http request.
  $requestUrl = $linkUri.GetComponents([System.UriComponents]::HttpRequestUrl, [System.UriFormat]::SafeUnescaped)
  $linkUri = [System.Uri]$requestUrl
  Write-Verbose "ResolvedUri $link to $linkUri"

  # Only http(s) and file uris can be verified.
  if (!($linkUri.Scheme.StartsWith("http") -or $linkUri.IsFile)) {
    Write-Verbose "Skipping $linkUri because it is not http or file based."
    return
  }

  # Honor the ignore list for both the raw href and the resolved uri.
  if ($null -ne $ignoreLinks) {
    $isIgnored = $ignoreLinks.Contains($link) -or $ignoreLinks.Contains($linkUri.ToString())
    if ($isIgnored) {
      Write-Verbose "Ignoring invalid link $linkUri because it is in the ignore file."
      return
    }
  }

  return $linkUri
}
176
function ParseLinks([string]$baseUri, [string]$htmlContent)
{
  # Extracts the href of every <a> tag in $htmlContent and resolves each one against $baseUri.
  # Links that ResolveUri skips are simply absent from the result.
  $anchorHrefPattern = "<a[^>]+href\s*=\s*[""']?(?<href>[^""']*)[""']?"
  $matchOptions = [System.Text.RegularExpressions.RegexOptions]"Singleline, IgnoreCase"

  $anchorMatches = [RegEx]::Matches($htmlContent, $anchorHrefPattern, $matchOptions)
  Write-Verbose "Found $($anchorMatches.Count) raw href's in page $baseUri";

  $resolvedLinks = $anchorMatches | ForEach-Object {
    $rawHref = $_.Groups["href"].Value
    ResolveUri $baseUri $rawHref
  }

  return $resolvedLinks
}
193
function CheckLink ([System.Uri]$linkUri, $allowRetry=$true)
{
  # Verifies a single link and stores the outcome in $checkedLinks.
  # The result is $false for a broken link and $true for a good one. When a web request fails in
  # a way that is not listed in $errorStatusCodes, the result is a truthy placeholder (the status
  # code or "Exception") so the link is neither reported as broken nor requested again.

  $isEmptyLink = !$linkUri.ToString().Trim()
  if ($isEmptyLink) {
    LogWarning "Found Empty link. Please use absolute link instead. Check here for more information: https://aka.ms/azsdk/guideline/links"
    return $false
  }

  # Keep the untouched uri for messages and the retry; the branch-substituted uri is what gets checked.
  $originalLinkUri = $linkUri
  $linkUri = ReplaceGithubLink $linkUri
  $link = $linkUri.ToString()

  # Reuse the recorded outcome when this link has been seen before.
  if ($checkedLinks.ContainsKey($link)) {
    $knownResult = $checkedLinks[$link]
    if (!$knownResult) {
      LogWarning "broken link $link"
    }
    return $knownResult
  }

  Write-Verbose "Checking link $linkUri..."
  $linkValid = $true

  if ($linkUri.IsFile) {
    $fileExists = Test-Path $linkUri.LocalPath
    if (!$fileExists) {
      LogWarning "Link to file does not exist $($linkUri.LocalPath)"
      $linkValid = $false
    }
  }
  elseif ($linkUri.IsAbsoluteUri) {
    try {
      # HEAD is tried first; any failure there falls back to a GET.
      $retryWithGet = $false
      try {
        $response = Invoke-WebRequest -Uri $linkUri -Method HEAD -UserAgent $userAgent -TimeoutSec $requestTimeoutSec
      }
      catch {
        $retryWithGet = $true
      }
      if ($retryWithGet) {
        $response = Invoke-WebRequest -Uri $linkUri -Method GET -UserAgent $userAgent -TimeoutSec $requestTimeoutSec
      }

      $statusCode = $response.StatusCode
      if ($statusCode -ne 200) {
        Write-Host "[$statusCode] while requesting $linkUri"
      }
    }
    catch {
      $failure = $_.Exception
      $statusCode = $failure.Response.StatusCode.value__

      if (!$statusCode) {
        # No http status: try the error code of an inner SocketException instead.
        $statusCode = $failure.InnerException.ErrorCode
      }

      if ($statusCode -in $errorStatusCodes) {
        if ($originalLinkUri -ne $linkUri) {
          LogError "[$statusCode] broken link $originalLinkUri (resolved to $linkUri)"
        }
        else {
          LogError "[$statusCode] broken link $linkUri"
        }

        $linkValid = $false
      }
      elseif ($null -eq $statusCode) {
        Write-Host "Exception while requesting $linkUri"
        Write-Host $failure.ToString()
        # Truthy placeholder: stops re-checking without declaring the link valid.
        $linkValid = "Exception"
      }
      elseif ($allowRetry -and $failure.Response -and $statusCode -eq 429) {
        # Rate limited: wait as instructed, capped at (and defaulting to) 60 seconds, then retry once.
        $retryAfter = $failure.Response.Headers.RetryAfter.Delta.TotalSeconds
        if (!$retryAfter -or $retryAfter -gt 60) { $retryAfter = 60 }
        Write-Host "Rate-Limited for $retryAfter seconds while requesting $linkUri"

        Start-Sleep -Seconds $retryAfter
        $linkValid = CheckLink $originalLinkUri -allowRetry $false
      }
      else {
        Write-Host "[$statusCode] handled while requesting $linkUri"
        # Truthy placeholder: stops re-checking without declaring the link valid.
        $linkValid = $statusCode
      }
    }
  }
  elseif ($link.StartsWith("#")) {
    # Anchor-only links are accepted as-is; there is no good way to verify them.
  }
  else {
    LogWarning "Link has invalid format $linkUri"
    $linkValid = $false
  }

  if ($checkLinkGuidance) {
    # Plain http is not allowed.
    if ($linkUri.Scheme -eq 'http') {
      LogWarning "DO NOT use 'http' in $linkUri. Please use secure link with https instead. Check here for more information: https://aka.ms/azsdk/guideline/links"
      $linkValid = $false
    }
    # Relative links are not allowed, except pure anchor links.
    if (!$linkUri.IsAbsoluteUri -and !$link.StartsWith("#")) {
      LogWarning "DO NOT use relative link $linkUri. Please use absolute link instead. Check here for more information: https://aka.ms/azsdk/guideline/links"
      $linkValid = $false
    }
    # Anchors must not contain upper-case letters.
    if ($link -cmatch '#[^?]*[A-Z]') {
      LogWarning "Please lower case your anchor tags (i.e. anything after '#' in your link '$linkUri'. Check here for more information: https://aka.ms/azsdk/guideline/links"
      $linkValid = $false
    }
    # Locale segments must not appear in the link.
    if ($linkUri -match $locale) {
      LogWarning "DO NOT include locale $locale information in links: $linkUri. Check here for more information: https://aka.ms/azsdk/guideline/links"
      $linkValid = $false
    }
  }

  $checkedLinks[$link] = $linkValid
  return $linkValid
}
322
function ReplaceGithubLink([string]$originLink) {
  # Swaps the branch portion of a link for $branchReplacementName when the link matches
  # $branchReplaceRegex; returns the link untouched when either setting is empty.
  if ($branchReplacementName -and $branchReplaceRegex) {
    # ${1} is written with braces so a replacement name starting with a digit is not read as part of the group number.
    $substitution = "`${1}$branchReplacementName`$2"
    return $originLink -replace $branchReplaceRegex, $substitution
  }

  return $originLink
}
330
function GetLinks([System.Uri]$pageUri)
{
  # Loads the page behind $pageUri (over http(s) or from disk), converts markdown to html when
  # the path ends in .md, and returns the links ParseLinks extracts from the content.
  if ($pageUri.Scheme.StartsWith("http")) {
    try {
      $content = (Invoke-WebRequest -Uri $pageUri -UserAgent $userAgent -TimeoutSec $requestTimeoutSec).Content

      $isMarkdownPage = $pageUri.ToString().EndsWith(".md")
      if ($isMarkdownPage) {
        $content = (ConvertFrom-MarkDown -InputObject $content).html
      }
    }
    catch {
      $statusCode = $_.Exception.Response.StatusCode.value__
      Write-Error "Invalid page [$statusCode] $pageUri"
    }
  }
  elseif ($pageUri.IsFile -and (Test-Path $pageUri.LocalPath)) {
    $file = $pageUri.LocalPath
    if ($file.EndsWith(".md")) {
      $content = (ConvertFrom-MarkDown $file).html
    }
    else {
      $readFrom = $file
      if (!$file.EndsWith(".html")) {
        # Not an html file: prefer an index.html appended to the path, otherwise read the path itself.
        $indexPage = $file + "index.html"
        if (Test-Path $indexPage) {
          $readFrom = $indexPage
        }
      }
      $content = Get-Content $readFrom
    }
  }
  else {
    Write-Error "Don't know how to process uri $pageUri"
  }

  return ParseLinks $pageUri $content
}
373
# Usage guard, kept exactly as strict as before: it only fires when $urls is truthy yet reports a count of zero.
if ($urls -and ($urls.Count -eq 0)) {
  Write-Host "Usage $($MyInvocation.MyCommand.Name) <urls>";
  exit 1;
}

# Warn when running on a PowerShell version older than 6.
$runningMajorVersion = $PSVersionTable.PSVersion.Major
if ($runningMajorVersion -lt 6)
{
  LogWarning "Some web requests will not work in versions of PS earlier then 6. You are running version $($PSVersionTable.PSVersion)."
}

# Load the links to ignore, dropping blank lines and lines that start with '#'.
$ignoreLinks = @();
$ignoreFileExists = Test-Path $ignoreLinksFile
if ($ignoreFileExists) {
  $ignoreLinks = (Get-Content $ignoreLinksFile).Where({ $_.Trim() -ne "" -and !$_.StartsWith("#") })
}
389
# Use default hashtable constructor instead of @{} because we need them to be case sensitive
$checkedPages = New-Object Hashtable
$checkedLinks = New-Object Hashtable

# Seed $checkedLinks with links already known to be good, read from a url or a local file.
if ($inputCacheFile)
{
  $cacheContent = ""
  if ($inputCacheFile.StartsWith("http")) {
    try {
      $response = Invoke-WebRequest -Uri $inputCacheFile -TimeoutSec $requestTimeoutSec
      $cacheContent = $response.Content
    }
    catch {
      $statusCode = $_.Exception.Response.StatusCode.value__
      Write-Error "Failed to read cache file from page [$statusCode] $inputCacheFile"
    }
  }
  elseif (Test-Path $inputCacheFile) {
    $cacheContent = Get-Content $inputCacheFile -Raw
  }

  # Split on LF or CRLF. Splitting on "`n" alone leaves a trailing carriage return on every entry
  # of a file with Windows line endings, so those entries would never match a checked link.
  # The -split operator also copes with $null content (an empty cache file).
  $goodLinks = ($cacheContent -split "\r?\n").Where({ $_.Trim() -ne "" -and !$_.StartsWith("#") })

  foreach ($goodLink in $goodLinks) {
    $checkedLinks[$goodLink] = $true
  }
}

# Remember how many entries came from the cache so they are not counted as checked later.
$cachedLinksCount = $checkedLinks.Count

if ($cachedLinksCount) {
  Write-Host "Skipping checks on $cachedLinksCount links found in the given cache of known good links."
}
422
# Pages with at least one broken link, keyed by page uri.
$badLinks = New-Object Hashtable

# Work queue of pages still to be scanned, seeded with the urls given on the command line.
$pageUrisToCheck = New-Object System.Collections.Queue
foreach ($url in $urls) {
  $pageUrisToCheck.Enqueue((NormalizeUrl $url))
}

if ($devOpsLogging) {
  Write-Host "##[group]Link checking details"
}

while ($pageUrisToCheck.Count -gt 0)
{
  $pageUri = $pageUrisToCheck.Dequeue()

  # Each page is scanned once.
  if ($checkedPages.ContainsKey($pageUri)) { continue }
  $checkedPages[$pageUri] = $true

  $linkUris = GetLinks $pageUri
  Write-Host "Checking $($linkUris.Count) links found on page $pageUri";

  $badLinksPerPage = @()
  foreach ($linkUri in $linkUris) {
    $isLinkValid = CheckLink $linkUri

    if (!$isLinkValid) {
      # Record each broken link once per page; an empty link is recorded as an explanatory message.
      if (!$badLinksPerPage.Contains($linkUri)) {
        if (!$linkUri.ToString().Trim()) {
          $linkUri = $emptyLinkMessage
        }
        $badLinksPerPage += $linkUri
      }
      # Broken links are never crawled.
      continue
    }

    # Valid links under the base url are queued for scanning when recursion is on.
    if ($recursive) {
      $isUnderBaseUrl = $linkUri.ToString().StartsWith($baseUrl)
      if ($isUnderBaseUrl -and !$checkedPages.ContainsKey($linkUri)) {
        $pageUrisToCheck.Enqueue($linkUri)
      }
    }
  }

  if ($badLinksPerPage.Count -gt 0) {
    $badLinks[$pageUri] = $badLinksPerPage
  }
}

if ($devOpsLogging) {
  Write-Host "##[endgroup]"
}
463
# Print the broken links grouped by the page they were found on.
$hasBrokenLinks = $badLinks.Count -gt 0
if ($hasBrokenLinks) {
  Write-Host "Summary of broken links:"
}
foreach ($pageLink in $badLinks.Keys) {
  Write-Host "'$pageLink' has $($badLinks[$pageLink].Count) broken link(s):"
  $badLinks[$pageLink] | ForEach-Object {
    $brokenLink = $_
    Write-Host " $brokenLink"
  }
}

# Links seeded from the input cache were not checked during this run.
$linksChecked = $checkedLinks.Count - $cachedLinksCount

if (!$hasBrokenLinks) {
  Write-Host "Checked $linksChecked links. No broken links found."
}
else {
  Write-Host "Checked $linksChecked links with $($badLinks.Count) broken link(s) found."
}

# Persist only the links whose recorded result is literally True; placeholder results are left out.
if ($outputCacheFile)
{
  $goodLinks = $checkedLinks.Keys.Where({ "True" -eq $checkedLinks[$_].ToString() }) | Sort-Object

  Write-Host "Writing the list of validated links to $outputCacheFile"
  $goodLinks | Set-Content $outputCacheFile
}

# The exit code is the number of pages that contain broken links.
exit $badLinks.Count
View as plain text