I want to extract text from html source
In resulting text file i have
But why function with regular expression did not delete this
How to fix this
C#:
using System;
using System.IO;
using System.Net;
using System.Threading;
using System.Runtime.InteropServices;
using System.Text.RegularExpressions;
namespace ConsoleApplication1
{
class Program
{
public static void Main(string[] args)
{
string outputpath = Console.ReadLine();
using(StreamWriter sw = new StreamWriter(outputpath))
{
string inputpathstart = "https://literat.ug.edu.pl/faraon/";
string inputpath;
int noofchapters = 0;
inputpath = inputpathstart + String.Empty;
try
{
WebRequest request = HttpWebRequest.Create(inputpath);
WebResponse response = request.GetResponse();
System.Text.Encoding enc = System.Text.Encoding.GetEncoding ("iso-8859-2");
using(StreamReader sr = new StreamReader(response.GetResponseStream(),enc))
{
String text = sr.ReadToEnd();
while(text.Contains((noofchapters+1).ToString("D3")+".htm"))
noofchapters++;
}
}
catch (Exception e)
{
Console.WriteLine("The file could not be read");
Console.WriteLine(e.Message);
}
Console.WriteLine("{0} ",noofchapters);
for(int i=1;i<=noofchapters;i++)
{
inputpath = inputpathstart + i.ToString("D3")+".htm";
try
{
WebRequest request = HttpWebRequest.Create(inputpath);
WebResponse response = request.GetResponse();
System.Text.Encoding enc = System.Text.Encoding.GetEncoding ("iso-8859-2");
using(StreamReader sr = new StreamReader(response.GetResponseStream(),enc))
{
String text = sr.ReadToEnd();
text = System.Web.HttpUtility.HtmlDecode(text);
text = RemoveHTMLTagsCompiled(text);
sw.WriteLine(text);
sw.WriteLine();
}
}
catch (Exception e)
{
Console.WriteLine("The file could not be read");
Console.WriteLine(e.Message);
}
}
}
Console.WriteLine("Press Enter to exit");
Console.ReadKey();
}
public static string RemoveHTMLTagsCompiled(string html)
{
string s;
Regex htmlRegex = new Regex("<.*?>", RegexOptions.Compiled);
s = htmlRegex.Replace(html, string.Empty);
return s;
}
}
}
In resulting text file i have
<meta name="Keywords"
content="Prus, Bolesław Prus, Głowacki, Aleksander Głowacki, Faraon, powieść, powieść polska, roman, polish roman, literary masterpiece, kultura polska, polish culture, Polska, Poland">
But why function with regular expression did not delete this
How to fix this