...

Text file src/github.com/Azure/azure-sdk-for-go/eng/common/scripts/Verify-Links.ps1

Documentation: github.com/Azure/azure-sdk-for-go/eng/common/scripts

     1<#
     2  .SYNOPSIS
     3  Check broken links.
     4
     5  .DESCRIPTION
     6  The Verify-Links.ps1 script will check whether the files contain any broken links.
     7
     8  .PARAMETER urls
     9  Specify url list to verify links. Can either be a http address or a local file request. Local file paths support md and html files.
    10
    11  .PARAMETER ignoreLinksFile
    12  Specifies the file that contains a set of links to ignore when verifying.
    13
    14  .PARAMETER devOpsLogging
    15  Switch that will enable devops specific logging for warnings
    16
    17  .PARAMETER recursive
    18  Check the links recursively based on recursivePattern.
    19
    20  .PARAMETER baseUrl
    21  Recursively check links for all links verified that begin with this baseUrl, defaults to the folder the url is contained in.
    22
    23  .PARAMETER rootUrl
    24  Path to the root of the site for resolving rooted relative links, defaults to host root for http and file directory for local files.
    25
    26  .PARAMETER errorStatusCodes
    27  List of http status codes that count as broken links. Defaults to 400, 401, 404, SocketError.HostNotFound = 11001, SocketError.NoData = 11004.
    28
    29  .PARAMETER branchReplaceRegex
    30  Regex to check if the link needs to be replaced. E.g. ^(https://github.com/.*/(?:blob|tree)/)main(/.*)$
    31
    32  .PARAMETER branchReplacementName
    33  The substitute branch name or SHA commit.
    34
    35  .PARAMETER checkLinkGuidance
    36  Flag to allow checking against azure sdk link guidance. Check link guidance here: https://aka.ms/azsdk/guideline/links.
    37
    38  .PARAMETER userAgent
    39  UserAgent to be configured for web requests. Defaults to current Chrome version.
    40
    41  .PARAMETER inputCacheFile
    42  Path to a file that contains a list of links that are known valid so we can skip checking them.
    43
    44  .PARAMETER outputCacheFile
    45  Path to a file that the script will output all the validated links after running all checks.
    46  
    47  .PARAMETER requestTimeoutSec
    48  The number of seconds before we timeout when sending an individual web request. Default is 15 seconds.
    49
    50  .EXAMPLE
    51  PS> .\Verify-Links.ps1 C:\README.md
    52
    53  .EXAMPLE
    54  PS> .\Verify-Links.ps1 https://azure.github.io/azure-sdk/index.html
    55
    56  .EXAMPLE
    57  PS> .\Verify-Links C:\README.md -checkLinkGuidance $true
    58#>
    59[CmdletBinding()]
    60param (
    61  [string[]] $urls,
    62  [string] $ignoreLinksFile = "$PSScriptRoot/ignore-links.txt",
    63  [switch] $devOpsLogging = $false,
    64  [switch] $recursive = $true,
    65  [string] $baseUrl = "",
    66  [string] $rootUrl = "",
    67  [array] $errorStatusCodes = @(400, 401, 404, 11001, 11004),
    68  [string] $branchReplaceRegex = "",
    69  [string] $branchReplacementName = "",
    70  [bool] $checkLinkGuidance = $false,
    71  [string] $userAgent,
    72  [string] $inputCacheFile,
    73  [string] $outputCacheFile,
    74  [string] $requestTimeoutSec  = 15
    75)
    76
    77$ProgressPreference = "SilentlyContinue"; # Disable invoke-webrequest progress dialog
    78# Regex of the locale keywords.
    79$locale = "/en-us/"
    80$emptyLinkMessage = "There is at least one empty link in the page. Please replace with absolute link. Check here for more information: https://aka.ms/azsdk/guideline/links"
    81if (!$userAgent) {
    82  $userAgent = "Chrome/87.0.4280.88"
    83}
    84function NormalizeUrl([string]$url){
    85  if (Test-Path $url) {
    86    $url = "file://" + (Resolve-Path $url).ToString();
    87  }
    88
    89  Write-Verbose "The url to check against: $url."
    90  $uri = [System.Uri]$url;
    91
    92  if ($script:baseUrl -eq "") {
    93    # for base url default to containing directory
    94    $script:baseUrl = (new-object System.Uri($uri, ".")).ToString();
    95  }
    96
    97  if ($script:rootUrl -eq "") {
    98    if ($uri.IsFile) {
    99      # for files default to the containing directory
   100      $script:rootUrl = $script:baseUrl;
   101    }
   102    else {
   103      # for http links default to the root path
   104      $script:rootUrl = new-object System.Uri($uri, "/");
   105    }
   106  }
   107  return $uri
   108}
   109
   110function LogWarning
   111{
   112  if ($devOpsLogging)
   113  {
   114    Write-Host "##vso[task.LogIssue type=warning;]$args"
   115  }
   116  else
   117  {
   118    Write-Warning "$args"
   119  }
   120}
   121
   122function LogError
   123{
   124  if ($devOpsLogging)
   125  {
   126    Write-Host "##vso[task.logissue type=error]$args"
   127  }
   128  else
   129  {
   130    Write-Error "$args"
   131  }
   132}
   133
   134function ResolveUri ([System.Uri]$referralUri, [string]$link)
   135{
   136  # If the link is mailto, skip it.
   137  if ($link.StartsWith("mailto:")) {
   138    Write-Verbose "Skipping $link because it is a mailto link."
   139    return
   140  }
   141
   142  $linkUri = [System.Uri]$link;
   143  # Our link guidelines do not allow relative links so only resolve them when we are not
   144  # validating links against our link guidelines (i.e. !$checkLinkGuideance)
   145  if ($checkLinkGuidance -and !$linkUri.IsAbsoluteUri) {
   146    return $linkUri
   147  }
   148
   149  if (!$linkUri.IsAbsoluteUri) {
   150    # For rooted paths resolve from the baseUrl
   151    if ($link.StartsWith("/")) {
   152      Write-Verbose "rooturl = $rootUrl"
   153      $linkUri = new-object System.Uri([System.Uri]$rootUrl, ".$link");
   154    }
   155    else {
   156      $linkUri = new-object System.Uri($referralUri, $link);
   157    }
   158  }
   159
   160  $linkUri = [System.Uri]$linkUri.GetComponents([System.UriComponents]::HttpRequestUrl, [System.UriFormat]::SafeUnescaped)
   161  Write-Verbose "ResolvedUri $link to $linkUri"
   162
   163  # If the link is not a web request, like mailto, skip it.
   164  if (!$linkUri.Scheme.StartsWith("http") -and !$linkUri.IsFile) {
   165    Write-Verbose "Skipping $linkUri because it is not http or file based."
   166    return
   167  }
   168
   169  if ($null -ne $ignoreLinks -and ($ignoreLinks.Contains($link) -or $ignoreLinks.Contains($linkUri.ToString()))) {
   170    Write-Verbose "Ignoring invalid link $linkUri because it is in the ignore file."
   171    return
   172  }
   173
   174  return $linkUri;
   175}
   176
   177function ParseLinks([string]$baseUri, [string]$htmlContent)
   178{
   179  $hrefRegex = "<a[^>]+href\s*=\s*[""']?(?<href>[^""']*)[""']?"
   180  $regexOptions = [System.Text.RegularExpressions.RegexOptions]"Singleline, IgnoreCase";
   181
   182  $hrefs = [RegEx]::Matches($htmlContent, $hrefRegex, $regexOptions);
   183
   184  #$hrefs | Foreach-Object { Write-Host $_ }
   185
   186  Write-Verbose "Found $($hrefs.Count) raw href's in page $baseUri";
   187  $links = $hrefs | ForEach-Object { ResolveUri $baseUri $_.Groups["href"].Value }
   188
   189  #$links | Foreach-Object { Write-Host $_ }
   190
   191  return $links
   192}
   193
   194function CheckLink ([System.Uri]$linkUri, $allowRetry=$true)
   195{
   196  if(!$linkUri.ToString().Trim()) {
   197    LogWarning "Found Empty link. Please use absolute link instead. Check here for more information: https://aka.ms/azsdk/guideline/links"
   198    return $false
   199  }
   200
   201  $originalLinkUri = $linkUri
   202  $linkUri = ReplaceGithubLink $linkUri
   203
   204  $link = $linkUri.ToString()
   205
   206  if ($checkedLinks.ContainsKey($link)) {
   207    if (!$checkedLinks[$link]) {
   208      LogWarning "broken link $link"
   209    }
   210    return $checkedLinks[$link]
   211  }
   212
   213  $linkValid = $true
   214  Write-Verbose "Checking link $linkUri..."
   215
   216  if ($linkUri.IsFile) {
   217    if (!(Test-Path $linkUri.LocalPath)) {
   218      LogWarning "Link to file does not exist $($linkUri.LocalPath)"
   219      $linkValid = $false
   220    }
   221  }
   222  elseif ($linkUri.IsAbsoluteUri) {
   223    try {
   224      $headRequestSucceeded = $true
   225      try {
   226        # Attempt HEAD request first
   227        $response = Invoke-WebRequest -Uri $linkUri -Method HEAD -UserAgent $userAgent -TimeoutSec $requestTimeoutSec
   228      }
   229      catch {
   230        $headRequestSucceeded = $false
   231      }
   232      if (!$headRequestSucceeded) {
   233        # Attempt a GET request if the HEAD request failed.
   234        $response = Invoke-WebRequest -Uri $linkUri -Method GET -UserAgent $userAgent -TimeoutSec $requestTimeoutSec
   235      }
   236      $statusCode = $response.StatusCode
   237      if ($statusCode -ne 200) {
   238        Write-Host "[$statusCode] while requesting $linkUri"
   239      }
   240    }
   241    catch {
   242      $statusCode = $_.Exception.Response.StatusCode.value__
   243
   244      if(!$statusCode) {
   245        # Try to pull the error code from any inner SocketException we might hit
   246        $statusCode = $_.Exception.InnerException.ErrorCode
   247      }
   248
   249      if ($statusCode -in $errorStatusCodes) {
   250        if ($originalLinkUri -ne $linkUri) {
   251          LogError "[$statusCode] broken link $originalLinkUri (resolved to $linkUri)"
   252        }
   253        else {
   254          LogError "[$statusCode] broken link $linkUri"
   255        }
   256
   257        $linkValid = $false
   258      }
   259      else {
   260        if ($null -ne $statusCode) {
   261          # For 429 rate-limiting try to pause if possible
   262          if ($allowRetry -and $_.Exception.Response -and $statusCode -eq 429) {
   263            $retryAfter = $_.Exception.Response.Headers.RetryAfter.Delta.TotalSeconds
   264
   265            # Default retry after 60 (arbitrary) seconds if no header given
   266            if (!$retryAfter -or $retryAfter -gt 60) { $retryAfter = 60 }
   267            Write-Host "Rate-Limited for $retryAfter seconds while requesting $linkUri"
   268
   269            Start-Sleep -Seconds $retryAfter
   270            $linkValid = CheckLink $originalLinkUri -allowRetry $false
   271          }
   272          else {
   273            Write-Host "[$statusCode] handled while requesting $linkUri"
   274            # Override and set status code in the cache so it is truthy
   275            # so we don't keep checking but we don't think it is valid either
   276            $linkValid = $statusCode
   277          }
   278        }
   279        else {
   280          Write-Host "Exception while requesting $linkUri"
   281          Write-Host $_.Exception.ToString()
   282          # Override and set exception in the cache so it is truthy
   283          # so we don't keep checking but we don't think it is valid either
   284          $linkValid = "Exception"
   285        }
   286      }
   287    }
   288  }
   289  elseif ($link.StartsWith("#")) {
   290    # Ignore anchor links as we don't have a great way to check them.
   291  }
   292  else {
   293    LogWarning "Link has invalid format $linkUri"
   294    $linkValid = $false
   295  }
   296
   297  if ($checkLinkGuidance) {
   298    if ($linkUri.Scheme -eq 'http') {
   299      LogWarning "DO NOT use 'http' in $linkUri. Please use secure link with https instead. Check here for more information: https://aka.ms/azsdk/guideline/links"
   300      $linkValid = $false
   301    }
   302    # Check if the url is relative links, suppress the archor link validation.
   303    if (!$linkUri.IsAbsoluteUri -and !$link.StartsWith("#")) {
   304      LogWarning "DO NOT use relative link $linkUri. Please use absolute link instead. Check here for more information: https://aka.ms/azsdk/guideline/links"
   305      $linkValid = $false
   306    }
   307    # Check if the url is anchor link has any uppercase.
   308    if ($link -cmatch '#[^?]*[A-Z]') {
   309      LogWarning "Please lower case your anchor tags (i.e. anything after '#' in your link '$linkUri'. Check here for more information: https://aka.ms/azsdk/guideline/links"
   310      $linkValid = $false
   311    }
   312     # Check if link uri includes locale info.
   313    if ($linkUri -match $locale) {
   314      LogWarning "DO NOT include locale $locale information in links: $linkUri. Check here for more information: https://aka.ms/azsdk/guideline/links"
   315      $linkValid = $false
   316    }
   317  }
   318
   319  $checkedLinks[$link] = $linkValid
   320  return $linkValid
   321}
   322
   323function ReplaceGithubLink([string]$originLink) {
   324  if (!$branchReplacementName -or !$branchReplaceRegex) {
   325    return $originLink
   326  }
   327  $ReplacementPattern = "`${1}$branchReplacementName`$2"
   328  return $originLink -replace $branchReplaceRegex, $ReplacementPattern
   329}
   330
   331function GetLinks([System.Uri]$pageUri)
   332{
   333  if ($pageUri.Scheme.StartsWith("http")) {
   334    try {
   335      $response = Invoke-WebRequest -Uri $pageUri -UserAgent $userAgent -TimeoutSec $requestTimeoutSec
   336      $content = $response.Content
   337
   338      if ($pageUri.ToString().EndsWith(".md")) {
   339        $content = (ConvertFrom-MarkDown -InputObject $content).html
   340      }
   341    }
   342    catch {
   343      $statusCode = $_.Exception.Response.StatusCode.value__
   344      Write-Error "Invalid page [$statusCode] $pageUri"
   345    }
   346  }
   347  elseif ($pageUri.IsFile -and (Test-Path $pageUri.LocalPath)) {
   348    $file = $pageUri.LocalPath
   349    if ($file.EndsWith(".md")) {
   350      $content = (ConvertFrom-MarkDown $file).html
   351    }
   352    elseif ($file.EndsWith(".html")) {
   353      $content = Get-Content $file
   354    }
   355    else {
   356      if (Test-Path ($file + "index.html")) {
   357        $content = Get-Content ($file + "index.html")
   358      }
   359      else {
   360        # Fallback to just reading the content directly
   361        $content = Get-Content $file
   362      }
   363    }
   364  }
   365  else {
   366    Write-Error "Don't know how to process uri $pageUri"
   367  }
   368
   369  $links = ParseLinks $pageUri $content
   370
   371  return $links;
   372}
   373
   374if ($urls) {
   375  if ($urls.Count -eq 0) {
   376    Write-Host "Usage $($MyInvocation.MyCommand.Name) <urls>";
   377    exit 1;
   378  }
   379}
   380
   381if ($PSVersionTable.PSVersion.Major -lt 6)
   382{
   383  LogWarning "Some web requests will not work in versions of PS earlier then 6. You are running version $($PSVersionTable.PSVersion)."
   384}
   385$ignoreLinks = @();
   386if (Test-Path $ignoreLinksFile) {
   387  $ignoreLinks = (Get-Content $ignoreLinksFile).Where({ $_.Trim() -ne "" -and !$_.StartsWith("#") })
   388}
   389
   390# Use default hashtable constructor instead of @{} because we need them to be case sensitive
   391$checkedPages = New-Object Hashtable
   392$checkedLinks = New-Object Hashtable
   393
   394if ($inputCacheFile)
   395{
   396  $cacheContent = ""
   397  if ($inputCacheFile.StartsWith("http")) {
   398    try {
   399      $response = Invoke-WebRequest -Uri $inputCacheFile -TimeoutSec $requestTimeoutSec
   400      $cacheContent = $response.Content
   401    }
   402    catch {
   403      $statusCode = $_.Exception.Response.StatusCode.value__
   404      Write-Error "Failed to read cache file from  page [$statusCode] $inputCacheFile"
   405    }
   406  }
   407  elseif (Test-Path $inputCacheFile) {
   408    $cacheContent = Get-Content $inputCacheFile -Raw
   409  }
   410  $goodLinks = $cacheContent.Split("`n").Where({ $_.Trim() -ne "" -and !$_.StartsWith("#") })
   411
   412  foreach ($goodLink in $goodLinks) {
   413    $checkedLinks[$goodLink] = $true
   414  }
   415}
   416
   417$cachedLinksCount = $checkedLinks.Count
   418
   419if ($cachedLinksCount) {
   420  Write-Host "Skipping checks on $cachedLinksCount links found in the given cache of known good links."
   421}
   422
   423$badLinks = New-Object Hashtable
   424$pageUrisToCheck = new-object System.Collections.Queue
   425foreach ($url in $urls) {
   426  $uri = NormalizeUrl $url
   427  $pageUrisToCheck.Enqueue($uri);
   428}
   429
   430if ($devOpsLogging) {
   431  Write-Host "##[group]Link checking details"
   432}
   433while ($pageUrisToCheck.Count -ne 0)
   434{
   435  $pageUri = $pageUrisToCheck.Dequeue();
   436  if ($checkedPages.ContainsKey($pageUri)) { continue }
   437  $checkedPages[$pageUri] = $true;
   438
   439  $linkUris = GetLinks $pageUri
   440  Write-Host "Checking $($linkUris.Count) links found on page $pageUri";
   441  $badLinksPerPage = @();
   442  foreach ($linkUri in $linkUris) {
   443    $isLinkValid = CheckLink $linkUri
   444    if (!$isLinkValid -and !$badLinksPerPage.Contains($linkUri)) {
   445      if (!$linkUri.ToString().Trim()) {
   446        $linkUri = $emptyLinkMessage
   447      }
   448      $badLinksPerPage += $linkUri
   449    }
   450    if ($recursive -and $isLinkValid) {
   451      if ($linkUri.ToString().StartsWith($baseUrl) -and !$checkedPages.ContainsKey($linkUri)) {
   452        $pageUrisToCheck.Enqueue($linkUri);
   453      }
   454    }
   455  }
   456  if ($badLinksPerPage.Count -gt 0) {
   457    $badLinks[$pageUri] = $badLinksPerPage
   458  }
   459}
   460if ($devOpsLogging) {
   461  Write-Host "##[endgroup]"
   462}
   463
   464if ($badLinks.Count -gt 0) {
   465  Write-Host "Summary of broken links:"
   466}
   467foreach ($pageLink in $badLinks.Keys) {
   468  Write-Host "'$pageLink' has $($badLinks[$pageLink].Count) broken link(s):"
   469  foreach ($brokenLink in $badLinks[$pageLink]) {
   470    Write-Host "  $brokenLink"
   471  }
   472}
   473
   474$linksChecked = $checkedLinks.Count - $cachedLinksCount
   475
   476if ($badLinks.Count -gt 0) {
   477  Write-Host "Checked $linksChecked links with $($badLinks.Count) broken link(s) found."
   478}
   479else {
   480  Write-Host "Checked $linksChecked links. No broken links found."
   481}
   482
   483if ($outputCacheFile)
   484{
   485  $goodLinks = $checkedLinks.Keys.Where({ "True" -eq $checkedLinks[$_].ToString() }) | Sort-Object
   486
   487  Write-Host "Writing the list of validated links to $outputCacheFile"
   488  $goodLinks | Set-Content $outputCacheFile
   489}
   490
   491exit $badLinks.Count

View as plain text