Scrape all text in <p> tags under a heading with Python
You can use a Python library such as BeautifulSoup to scrape all the text within <p>
tags that are located under a specific heading. Here is an example of how you could use BeautifulSoup to accomplish this:
"""Scrape the text of every <p> tag that follows a given heading."""
from bs4 import BeautifulSoup
import requests

# Make a request to the website
response = requests.get('http://yourwebsite.com')

# Parse the HTML content
soup = BeautifulSoup(response.content, 'html.parser')

# Find the heading element.
# FIX: the `text=` keyword is deprecated in bs4 >= 4.4; `string=` is the
# supported name for matching a tag by its string content.
heading = soup.find("h2", string="Your heading text here")

# Fail with a clear message instead of an AttributeError on `None`.
if heading is None:
    raise ValueError("Heading not found on the page")

# Find all <p> tags that are siblings following the heading
paragraphs = heading.find_next_siblings("p")

# Extract the text from each <p> tag
for p in paragraphs:
    print(p.text)
Scrape all text in <p> tags under a heading with Python
"""Scrape the Plot section of a Wikipedia article (text until the next H2)."""
from bs4 import BeautifulSoup as bs
from requests import get

url = "https://en.wikipedia.org/wiki/Harry_Potter_and_the_Half-Blood_Prince"
response = get(url)
soup = bs(response.text, 'html.parser')

# collect the plot text in this list
plot = []

# Take the second <h2> on the page, then the child <span> that carries an id
# (on Wikipedia the section anchor, e.g. id="Plot", lives on a child <span>).
# FIX: findAll is the deprecated BS3 name; bs4 uses find_all.
mark1 = soup.find_all("h2")[1]
mark = mark1.find(lambda tag: tag.name == 'span' and tag.get('id'))

# Walk through the siblings of the parent (H2) node until we reach the
# next H2 node.
# FIX: nextSiblingGenerator() is the deprecated BeautifulSoup 3 name;
# the bs4 equivalent is the .next_siblings generator.
for elt in mark.parent.next_siblings:
    if elt.name == "h2":
        break
    # NavigableStrings lack .text; only collect real tag elements
    if hasattr(elt, "text"):
        plot.append(elt.text)

joined_plot = "".join(plot)
print(joined_plot)
s
Scrape all bold <p> elements under an h2 (located by id or class) of a webpage
"""Collect the text of sibling elements containing a <b> tag, from the
"See" section anchor up to the next H2 heading."""
from bs4 import BeautifulSoup as bs
from requests import get

url = "https://wikitravel.org/en/Bhopal"
page = get(url)
soup = bs(page.text, 'html.parser')

# The element whose id is "See" is the section anchor; its parent is
# the section's <h2> heading.
anchor = soup.find(id="See")

# Walk the siblings of that <h2>, stopping at the next <h2>, and keep
# every element that has text and contains at least one <b> tag.
collected = []
for sibling in anchor.parent.next_siblings:
    if sibling.name == "h2":
        break
    if hasattr(sibling, "text") and sibling.find('b'):
        collected.append(sibling.text)

print(*collected, sep='\n')  # one collected item per line, for readability
v
Scrape all <p> tags between two headings on a Wikipedia page
"""Scrape all text between the "Plot" heading and the next H2 heading."""
from bs4 import BeautifulSoup as bs
from requests import get

url = "https://en.wikipedia.org/wiki/Harry_Potter_and_the_Half-Blood_Prince"
response = get(url)
soup = bs(response.text, 'html.parser')

# collect the plot text in this list
plot = []

# find the node with id "Plot" (the section anchor inside the <h2>)
mark = soup.find(id="Plot")

# Walk through the siblings of the parent (H2) node until we reach the
# next H2 node.
# FIX: nextSiblingGenerator() is the deprecated BeautifulSoup 3 name;
# the bs4 equivalent is the .next_siblings generator.
for elt in mark.parent.next_siblings:
    if elt.name == "h2":
        break
    # NavigableStrings lack .text; only collect real tag elements
    if hasattr(elt, "text"):
        plot.append(elt.text)

print("".join(plot))
Scrape all h2 headings and all p paragraphs of a webpage
"""Collect every <h2> heading and every <p class="MsoNormal"> paragraph
that follows a marker element, pairing them index-by-index."""
from bs4 import BeautifulSoup as bs
from requests import get

# read html code snippet
url = 'https://feniinformation.blogspot.com/2022/04/dddd.html'
response = get(url)
html = bs(response.text, 'html.parser')

# find the node with class "WordSection1"
mark = html.find(class_="WordSection1")

# initialize container lists
headers = []
content = []

# get all h2 and p (with class MsoNormal) tags appearing after the marker
find_headers = mark.find_all_next('h2')
find_content = mark.find_all_next('p', {'class': 'MsoNormal'})

# NOTE(review): zip pairs the i-th heading with the i-th paragraph only —
# it does NOT group all paragraphs under their heading, and it silently
# truncates to the shorter list. Confirm this pairing is intended.
for head, cont in zip(find_headers, find_content):
    headers.append(head.text)
    content.append(cont.text)

# output the lists
# FIX: the original f-string contained a stray quote after {headers}
print(f"Headers = {headers}\n\nContent = {content}")
print(headers[1])
print(content[1])
print(headers[2])
print(content[2])
Best version:
Single section —
"""Scrape one article section: collect the h3/p/ul/ol/blockquote content
that follows a chosen <h2> until the next <h2>."""
from bs4 import BeautifulSoup as bs
from requests import get

url = 'https://www.gobankingrates.com/reviews/security-service-fcu/'
response = get(url)
soup = bs(response.text, 'html.parser')

# restrict the h2 search to the article body
allh2soup = soup.find('div', {'class': 'row article-container'})
allh2 = allh2soup.find_all("h2")
print(list(range(len(allh2))))

# print the available h2 indices, one per line
for idx in range(len(allh2)):
    print(idx)

h3plot1 = []
plot1 = []
ulplot1 = []
olplot1 = []
blockquoteplot1 = []

# pick the fifth <h2> of the whole page and show its heading text
mark1 = soup.find_all("h2")[4]
print(mark1.text)

# FIX: the original looped `for h2 in mark1:` — that iterates the h2's
# *children* and re-walks the same sibling chain once per child, appending
# duplicate text. Walk the sibling chain exactly once.
# FIX: nextSiblingGenerator() is the deprecated BS3 name for .next_siblings.
for elt in mark1.next_siblings:
    if elt.name == "h2":
        break
    if elt.name == "h3":
        h3plot1.append(elt.text)
    if elt.name == "p":
        plot1.append(elt.text)
    if elt.name == "ul":
        ulplot1.append(elt.text)
    if elt.name == "ol":
        olplot1.append(elt.text)
    if elt.name == "blockquote":
        blockquoteplot1.append(elt.text)

print(h3plot1)
joinedplot1 = "".join(plot1)
print(joinedplot1)
joinedulolplot1 = "".join(ulplot1 + olplot1)
print(joinedulolplot1)
joinedblockquoteplot1 = "".join(blockquoteplot1)
print(joinedblockquoteplot1)
Bulk version (all sections, fully working) —
"""Scrape many article sections in bulk: for each <h2> starting at index 11,
collect the h3/p/ul/ol/blockquote content until the next <h2>."""
from bs4 import BeautifulSoup as bs
from requests import get

url = 'https://www.gobankingrates.com/reviews/security-service-fcu/'
response = get(url)
soup = bs(response.text, 'html.parser')

# restrict the h2 count to the article body
allh2soup = soup.find('div', {'class': 'row article-container'})
allh2 = allh2soup.find_all("h2")
print(list(range(len(allh2))))

# hoisted out of the loop: the original re-ran soup.find_all("h2") on
# every iteration.
# NOTE(review): this indexes the page-wide h2 list but bounds the loop by
# the article-only count — confirm the two lists line up as intended.
page_h2s = soup.find_all("h2")

# process every section heading starting from index 11
for i in range(11, len(allh2)):
    h3plot1 = []
    plot1 = []
    ulplot1 = []
    olplot1 = []
    blockquoteplot1 = []

    mark1 = page_h2s[i]
    print(mark1.text)

    # FIX: the original looped `for h2 in mark1:` — that iterates the h2's
    # *children* and re-walks the same sibling chain once per child,
    # appending duplicate text. Walk the sibling chain exactly once.
    # FIX: nextSiblingGenerator() is the deprecated BS3 name for
    # .next_siblings.
    for elt in mark1.next_siblings:
        if elt.name == "h2":
            break
        if elt.name == "h3":
            h3plot1.append(elt.text)
        if elt.name == "p":
            plot1.append(elt.text)
        if elt.name == "ul":
            ulplot1.append(elt.text)
        if elt.name == "ol":
            olplot1.append(elt.text)
        if elt.name == "blockquote":
            blockquoteplot1.append(elt.text)

    print(h3plot1)
    joinedplot1 = "".join(plot1)
    print(joinedplot1)
    joinedulolplot1 = "".join(ulplot1 + olplot1)
    print(joinedulolplot1)
    joinedblockquoteplot1 = "".join(blockquoteplot1)
    print(joinedblockquoteplot1)
Bulk version 2 —